diff options
author | Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> | 2017-08-04 13:50:47 +0200 |
---|---|---|
committer | Emil Velikov <emil.l.velikov@gmail.com> | 2017-08-17 16:46:48 +0100 |
commit | 3aece9f7688fc058e981cb4979caaf2b6b6039e7 (patch) | |
tree | e1c34fa0f593c21d54d1586b6fdba50c791c29ba /shaders | |
parent | 4262876d7cedc1214fbf8c5aecb2290c2711ecfc (diff) |
shaders: Add Dolphin’s übershaders.
These shaders have been generated by Dolphin 9649494f67 on Mesa
8c26b52349 for an HD4000 GPU.
They include a lot of uniform branches, mostly on integers, as well as
switch statements branching on small and bounded integers.
Signed-off-by: Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
Diffstat (limited to 'shaders')
27 files changed, 33534 insertions, 0 deletions
diff --git a/shaders/dolphin/ubershaders/102.shader_test b/shaders/dolphin/ubershaders/102.shader_test new file mode 100644 index 0000000..d7cb63a --- /dev/null +++ b/shaders/dolphin/ubershaders/102.shader_test @@ -0,0 +1,1258 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 3u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 3 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/111.shader_test b/shaders/dolphin/ubershaders/111.shader_test new file mode 100644 index 0000000..205246b --- /dev/null +++ b/shaders/dolphin/ubershaders/111.shader_test @@ -0,0 +1,1268 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 4u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 4 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/12.shader_test b/shaders/dolphin/ubershaders/12.shader_test new file mode 100644 index 0000000..d61a2c3 --- /dev/null +++ b/shaders/dolphin/ubershaders/12.shader_test @@ -0,0 +1,961 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 0 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/120.shader_test b/shaders/dolphin/ubershaders/120.shader_test new file mode 100644 index 0000000..a10c631 --- /dev/null +++ b/shaders/dolphin/ubershaders/120.shader_test @@ -0,0 +1,1281 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 4u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 4 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/129.shader_test b/shaders/dolphin/ubershaders/129.shader_test new file mode 100644 index 0000000..6f74f99 --- /dev/null +++ b/shaders/dolphin/ubershaders/129.shader_test @@ -0,0 +1,1269 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 4u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 4 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/138.shader_test b/shaders/dolphin/ubershaders/138.shader_test new file mode 100644 index 0000000..88a4074 --- /dev/null +++ b/shaders/dolphin/ubershaders/138.shader_test @@ -0,0 +1,1279 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 5u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 5 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/147.shader_test b/shaders/dolphin/ubershaders/147.shader_test new file mode 100644 index 0000000..7e44656 --- /dev/null +++ b/shaders/dolphin/ubershaders/147.shader_test @@ -0,0 +1,1292 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 5u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 5 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/156.shader_test b/shaders/dolphin/ubershaders/156.shader_test new file mode 100644 index 0000000..f2e532e --- /dev/null +++ b/shaders/dolphin/ubershaders/156.shader_test @@ -0,0 +1,1280 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 5u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 5 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/165.shader_test b/shaders/dolphin/ubershaders/165.shader_test new file mode 100644 index 0000000..560e074 --- /dev/null +++ b/shaders/dolphin/ubershaders/165.shader_test @@ -0,0 +1,1290 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 6u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 6 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/174.shader_test b/shaders/dolphin/ubershaders/174.shader_test new file mode 100644 index 0000000..4fc32ba --- /dev/null +++ b/shaders/dolphin/ubershaders/174.shader_test @@ -0,0 +1,1303 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 6u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 6 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/183.shader_test b/shaders/dolphin/ubershaders/183.shader_test new file mode 100644 index 0000000..a4a8ee6 --- /dev/null +++ b/shaders/dolphin/ubershaders/183.shader_test @@ -0,0 +1,1291 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 6u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 6 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/192.shader_test b/shaders/dolphin/ubershaders/192.shader_test new file mode 100644 index 0000000..ff28abd --- /dev/null +++ b/shaders/dolphin/ubershaders/192.shader_test @@ -0,0 +1,1301 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 7u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 7 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/201.shader_test b/shaders/dolphin/ubershaders/201.shader_test new file mode 100644 index 0000000..7509f2e --- /dev/null +++ b/shaders/dolphin/ubershaders/201.shader_test @@ -0,0 +1,1314 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 7u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 7 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/21.shader_test b/shaders/dolphin/ubershaders/21.shader_test new file mode 100644 index 0000000..4490850 --- /dev/null +++ b/shaders/dolphin/ubershaders/21.shader_test @@ -0,0 +1,949 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 0 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/210.shader_test b/shaders/dolphin/ubershaders/210.shader_test new file mode 100644 index 0000000..1299ee0 --- /dev/null +++ b/shaders/dolphin/ubershaders/210.shader_test @@ -0,0 +1,1302 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 7u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 7 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/219.shader_test b/shaders/dolphin/ubershaders/219.shader_test new file mode 100644 index 0000000..0ae96ed --- /dev/null +++ b/shaders/dolphin/ubershaders/219.shader_test @@ -0,0 +1,1312 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +o.tex7 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 8u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + case 7u: output_tex.xyz = o.tex7; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + case 7u: tmp = int(rawtex7.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + case 7u: o.tex7 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.tex7 = o.tex7; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 8 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + case 7u: + return tex7; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/228.shader_test b/shaders/dolphin/ubershaders/228.shader_test new file mode 100644 index 0000000..b87278e --- /dev/null +++ b/shaders/dolphin/ubershaders/228.shader_test @@ -0,0 +1,1325 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +o.tex7 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 8u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + case 7u: output_tex.xyz = o.tex7; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + case 7u: tmp = int(rawtex7.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + case 7u: o.tex7 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.tex7 = o.tex7; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 8 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + case 7u: + return tex7; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/237.shader_test b/shaders/dolphin/ubershaders/237.shader_test new file mode 100644 index 0000000..78c9356 --- /dev/null +++ b/shaders/dolphin/ubershaders/237.shader_test @@ -0,0 +1,1313 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +o.tex3 = float3(0.0, 0.0, 0.0); +o.tex4 = float3(0.0, 0.0, 0.0); +o.tex5 = float3(0.0, 0.0, 0.0); +o.tex6 = float3(0.0, 0.0, 0.0); +o.tex7 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 8u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + case 3u: output_tex.xyz = o.tex3; break; + case 4u: output_tex.xyz = o.tex4; break; + case 5u: output_tex.xyz = o.tex5; break; + case 6u: output_tex.xyz = o.tex6; break; + case 7u: output_tex.xyz = o.tex7; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + case 3u: tmp = int(rawtex3.z); break; + case 4u: tmp = int(rawtex4.z); break; + case 5u: tmp = int(rawtex5.z); break; + case 6u: tmp = int(rawtex6.z); break; + case 7u: tmp = int(rawtex7.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + case 3u: o.tex3 = output_tex; break; + case 4u: o.tex4 = output_tex; break; + case 5u: o.tex5 = output_tex; break; + case 6u: o.tex6 = output_tex; break; + case 7u: o.tex7 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.tex3 = o.tex3; + vs.tex4 = o.tex4; + vs.tex5 = o.tex5; + vs.tex6 = o.tex6; + vs.tex7 = o.tex7; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 8 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float3 tex3; + float3 tex4; + float3 tex5; + float3 tex6; + float3 tex7; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + case 3u: + return tex3; + case 4u: + return tex4; + case 5u: + return tex5; + case 6u: + return tex6; + case 7u: + return tex7; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/3.shader_test b/shaders/dolphin/ubershaders/3.shader_test new file mode 100644 index 0000000..f3256f8 --- /dev/null +++ b/shaders/dolphin/ubershaders/3.shader_test @@ -0,0 +1,948 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 0 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/30.shader_test b/shaders/dolphin/ubershaders/30.shader_test new file mode 100644 index 0000000..ddbc48a --- /dev/null +++ b/shaders/dolphin/ubershaders/30.shader_test @@ -0,0 +1,1235 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +{ const uint texgen = 0u; + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 1 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/39.shader_test b/shaders/dolphin/ubershaders/39.shader_test new file mode 100644 index 0000000..19b90c0 --- /dev/null +++ b/shaders/dolphin/ubershaders/39.shader_test @@ -0,0 +1,1248 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +{ const uint texgen = 0u; + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 1 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/48.shader_test b/shaders/dolphin/ubershaders/48.shader_test new file mode 100644 index 0000000..8e27f9f --- /dev/null +++ b/shaders/dolphin/ubershaders/48.shader_test @@ -0,0 +1,1236 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +{ const uint texgen = 0u; + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 1 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/57.shader_test b/shaders/dolphin/ubershaders/57.shader_test new file mode 100644 index 0000000..7372be8 --- /dev/null +++ b/shaders/dolphin/ubershaders/57.shader_test @@ -0,0 +1,1246 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 2u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 2 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/66.shader_test b/shaders/dolphin/ubershaders/66.shader_test new file mode 100644 index 0000000..098f3ec --- /dev/null +++ b/shaders/dolphin/ubershaders/66.shader_test @@ -0,0 +1,1259 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 2u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 2 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/75.shader_test b/shaders/dolphin/ubershaders/75.shader_test new file mode 100644 index 0000000..db64b36 --- /dev/null +++ b/shaders/dolphin/ubershaders/75.shader_test @@ -0,0 +1,1247 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 2u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 2 texgens, early-depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +FORCE_EARLY_Z; +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/84.shader_test b/shaders/dolphin/ubershaders/84.shader_test new file mode 100644 index 0000000..2c4511c --- /dev/null +++ b/shaders/dolphin/ubershaders/84.shader_test @@ -0,0 +1,1257 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 3u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 3 texgens +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + diff --git a/shaders/dolphin/ubershaders/93.shader_test b/shaders/dolphin/ubershaders/93.shader_test new file mode 100644 index 0000000..42c01d2 --- /dev/null +++ b/shaders/dolphin/ubershaders/93.shader_test @@ -0,0 +1,1270 @@ +[require] +GLSL >= 4.00 + +[vertex shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Vertex UberShader + +struct Light { + int4 color; + float4 cosatt; + float4 distatt; + float4 pos; + float4 dir; +}; +UBO_BINDING(std140, 2) uniform VSBlock { + uint components; + uint xfmem_dualTexInfo; + uint xfmem_numColorChans; + float4 cpnmtx[6]; + float4 cproj[4]; + int4 cmtrl[4]; + Light clights[8]; + float4 ctexmtx[24]; + float4 ctrmtx[64]; + float4 cnmtx[32]; + float4 cpostmtx[64]; + float4 cpixelcenter; + float2 cviewport; + uint4 xfmem_pack1[8]; + #define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x) + #define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y) + #define xfmem_color(i) (xfmem_pack1[(i)].z) + #define xfmem_alpha(i) (xfmem_pack1[(i)].w) +}; +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, float3 normal) { + float3 ldir, h, cosAttn, distAttn; + float dist, dist2, attn; + + switch (attnfunc) { + case 0u: // LIGNTATTN_NONE + case 2u: // LIGHTATTN_DIR + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = 1.0; + if (length(ldir) == 0.0) + ldir = normal; + break; + + case 1u: // LIGHTATTN_SPEC + ldir = normalize(clights[index].pos.xyz - pos.xyz); + attn = (dot(normal, ldir) >= 0.0) ? max(0.0, dot(normal, clights[index].dir.xyz)) : 0.0; + cosAttn = clights[index].cosatt.xyz; + if (diffusefunc == 0u) // LIGHTDIF_NONE + distAttn = clights[index].distatt.xyz; + else + distAttn = normalize(clights[index].distatt.xyz); + attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, float3(1.0, attn, attn*attn)); + break; + + case 3u: // LIGHTATTN_SPOT + ldir = clights[index].pos.xyz - pos.xyz; + dist2 = dot(ldir, ldir); + dist = sqrt(dist2); + ldir = ldir / dist; + attn = max(0.0, dot(ldir, clights[index].dir.xyz)); + attn = max(0.0, clights[index].cosatt.x + clights[index].cosatt.y * attn + clights[index].cosatt.z * attn * attn) / dot(clights[index].distatt.xyz, float3(1.0, dist, dist2)); + break; + + default: + attn = 1.0; + ldir = normal; + break; + } + + switch (diffusefunc) { + case 0u: // LIGHTDIF_NONE + return int4(round(attn * float4(clights[index].color))); + + case 1u: // LIGHTDIF_SIGN + return int4(round(attn * dot(ldir, normal) * float4(clights[index].color))); + + case 2u: // LIGHTDIF_CLAMP + return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(clights[index].color))); + + default: + return int4(0, 0, 0, 0); + } +} + +ATTRIBUTE_LOCATION(0) in float4 rawpos; +ATTRIBUTE_LOCATION(1) in uint4 posmtx; +ATTRIBUTE_LOCATION(2) in float3 rawnorm0; +ATTRIBUTE_LOCATION(3) in float3 rawnorm1; +ATTRIBUTE_LOCATION(4) in float3 rawnorm2; +ATTRIBUTE_LOCATION(5) in float4 rawcolor0; +ATTRIBUTE_LOCATION(6) in float4 rawcolor1; +ATTRIBUTE_LOCATION(8) in float3 rawtex0; +ATTRIBUTE_LOCATION(9) in float3 rawtex1; +ATTRIBUTE_LOCATION(10) in float3 rawtex2; +ATTRIBUTE_LOCATION(11) in float3 rawtex3; +ATTRIBUTE_LOCATION(12) in float3 rawtex4; +ATTRIBUTE_LOCATION(13) in float3 rawtex5; +ATTRIBUTE_LOCATION(14) in float3 rawtex6; +ATTRIBUTE_LOCATION(15) in float3 rawtex7; +VARYING_LOCATION(0) out VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +} vs; +void main() +{ +VS_OUTPUT o; + +// Position matrix +float4 P0; +float4 P1; +float4 P2; + +// Normal matrix +float3 N0; +float3 N1; +float3 N2; + +if ((components & 2u) != 0u) {// VB_HAS_POSMTXIDX + // Vertex format has a per-vertex matrix + int posidx = int(posmtx.r); + P0 = ctrmtx[posidx]; + P1 = ctrmtx[posidx+1]; + P2 = ctrmtx[posidx+2]; + + int normidx = posidx >= 32 ? (posidx - 32) : posidx; + N0 = cnmtx[normidx].xyz; + N1 = cnmtx[normidx+1].xyz; + N2 = cnmtx[normidx+2].xyz; +} else { + // One shared matrix + P0 = cpnmtx[0]; + P1 = cpnmtx[1]; + P2 = cpnmtx[2]; + N0 = cpnmtx[3].xyz; + N1 = cpnmtx[4].xyz; + N2 = cpnmtx[5].xyz; +} + +float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0); +o.pos = float4(dot(cproj[0], pos), dot(cproj[1], pos), dot(cproj[2], pos), dot(cproj[3], pos)); + +// Only the first normal gets normalized (TODO: why?) +float3 _norm0 = float3(0.0, 0.0, 0.0); +if ((components & 1024u) != 0u) // VB_HAS_NRM0 + _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0))); + +float3 _norm1 = float3(0.0, 0.0, 0.0); +if ((components & 2048u) != 0u) // VB_HAS_NRM1 + _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1)); + +float3 _norm2 = float3(0.0, 0.0, 0.0); +if ((components & 4096u) != 0u) // VB_HAS_NRM2 + _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2)); + +// Lighting +for (uint chan = 0u; chan < xfmem_numColorChans; chan++) { + uint colorreg = xfmem_color(chan); + uint alphareg = xfmem_alpha(chan); + int4 mat = cmtrl[chan + 2u]; + int4 lacc = int4(255, 255, 255, 255); + + if (bitfieldExtract(colorreg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + mat.xyz = int3(255, 255, 255); + } + + if (bitfieldExtract(alphareg, 0, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + mat.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + mat.w = int(round(rawcolor0.w * 255.0)); + else + mat.w = 255; + } else { + mat.w = cmtrl [chan + 2u].w; + } + + if (bitfieldExtract(colorreg, 1, 1) != 0u) { + if (bitfieldExtract(colorreg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.xyz = int3(round(((chan == 0u) ? rawcolor0.xyz : rawcolor1.xyz) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.xyz = int3(round(rawcolor0.xyz * 255.0)); + else + lacc.xyz = int3(255, 255, 255); + } else { + lacc.xyz = cmtrl [chan].xyz; + } + + uint light_mask = bitfieldExtract(colorreg, 2, 4) | (bitfieldExtract(colorreg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(colorreg, 9, 2); + uint diffusefunc = bitfieldExtract(colorreg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + if ((light_mask & (1u << light_index)) != 0u) + lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).xyz; + } + } + + if (bitfieldExtract(alphareg, 1, 1) != 0u) { + if (bitfieldExtract(alphareg, 6, 1) != 0u) { + if ((components & (8192u << chan)) != 0u) // VB_HAS_COL0 + lacc.w = int(round(((chan == 0u) ? rawcolor0.w : rawcolor1.w) * 255.0)); + else if ((components & 8192u) != 0u) // VB_HAS_COLO0 + lacc.w = int(round(rawcolor0.w * 255.0)); + else + lacc.w = 255; + } else { + lacc.w = cmtrl [chan].w; + } + + uint light_mask = bitfieldExtract(alphareg, 2, 4) | (bitfieldExtract(alphareg, 11, 4) << 4u); + uint attnfunc = bitfieldExtract(alphareg, 9, 2); + uint diffusefunc = bitfieldExtract(alphareg, 7, 2); + for (uint light_index = 0u; light_index < 8u; light_index++) { + + if ((light_mask & (1u << light_index)) != 0u) + + lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos.xyz, _norm0).w; + } + } + + lacc = clamp(lacc, 0, 255); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0; + switch (chan) { + case 0u: o.colors_0 = lit_color; break; + case 1u: o.colors_1 = lit_color; break; + } +} + +if (xfmem_numColorChans < 2u && (components & 16384u) == 0u) + o.colors_1 = o.colors_0; + +o.tex0 = float3(0.0, 0.0, 0.0); +o.tex1 = float3(0.0, 0.0, 0.0); +o.tex2 = float3(0.0, 0.0, 0.0); +// Texture coordinate generation +for (uint texgen = 0u; texgen < 3u; texgen++) { + // Texcoord transforms + float4 coord = float4(0.0, 0.0, 1.0, 1.0); + uint texMtxInfo = xfmem_texMtxInfo(texgen); + switch (bitfieldExtract(texMtxInfo, 7, 5)) { + case 0u: // XF_SRCGEOM_INROW + coord.xyz = rawpos.xyz; + break; + + case 1u: // XF_SRCNORMAL_INROW + coord.xyz = ((components & 1024u /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz; break; + + case 3u: // XF_SRCBINORMAL_T_INROW + coord.xyz = ((components & 2048u /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz; break; + + case 4u: // XF_SRCBINORMAL_B_INROW + coord.xyz = ((components & 4096u /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz; break; + + case 5u: // XF_SRCTEX0_INROW + coord = ((components & 32768u /* VB_HAS_UV0 */) != 0u) ? float4(rawtex0.x, rawtex0.y, 1.0, 1.0) : coord; + break; + + case 6u: // XF_SRCTEX1_INROW + coord = ((components & 65536u /* VB_HAS_UV1 */) != 0u) ? float4(rawtex1.x, rawtex1.y, 1.0, 1.0) : coord; + break; + + case 7u: // XF_SRCTEX2_INROW + coord = ((components & 131072u /* VB_HAS_UV2 */) != 0u) ? float4(rawtex2.x, rawtex2.y, 1.0, 1.0) : coord; + break; + + case 8u: // XF_SRCTEX3_INROW + coord = ((components & 262144u /* VB_HAS_UV3 */) != 0u) ? float4(rawtex3.x, rawtex3.y, 1.0, 1.0) : coord; + break; + + case 9u: // XF_SRCTEX4_INROW + coord = ((components & 524288u /* VB_HAS_UV4 */) != 0u) ? float4(rawtex4.x, rawtex4.y, 1.0, 1.0) : coord; + break; + + case 10u: // XF_SRCTEX5_INROW + coord = ((components & 1048576u /* VB_HAS_UV5 */) != 0u) ? float4(rawtex5.x, rawtex5.y, 1.0, 1.0) : coord; + break; + + case 11u: // XF_SRCTEX6_INROW + coord = ((components & 2097152u /* VB_HAS_UV6 */) != 0u) ? float4(rawtex6.x, rawtex6.y, 1.0, 1.0) : coord; + break; + + case 12u: // XF_SRCTEX7_INROW + coord = ((components & 4194304u /* VB_HAS_UV7 */) != 0u) ? float4(rawtex7.x, rawtex7.y, 1.0, 1.0) : coord; + break; + + } + + // Input form of AB11 sets z element to 1.0 + if (bitfieldExtract(texMtxInfo, 2, 1) == 0u) // inputform == XF_TEXINPUT_AB11 + coord.z = 1.0f; + + // first transformation + uint texgentype = bitfieldExtract(texMtxInfo, 4, 3); + float3 output_tex; + switch (texgentype) + { + case 1u: // XF_TEXGEN_EMBOSS_MAP + { + uint light = bitfieldExtract(texMtxInfo, 15, 3); + uint source = bitfieldExtract(texMtxInfo, 12, 3); + switch (source) { + case 0u: output_tex.xyz = o.tex0; break; + case 1u: output_tex.xyz = o.tex1; break; + case 2u: output_tex.xyz = o.tex2; break; + default: output_tex.xyz = float3(0.0, 0.0, 0.0); break; + } + if ((components & 6144u) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2 + float3 ldir = normalize(clights[light].pos.xyz - pos.xyz); + output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0); + } + } + break; + + case 2u: // XF_TEXGEN_COLOR_STRGBC0 + output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0); + break; + + case 3u: // XF_TEXGEN_COLOR_STRGBC1 + output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0); + break; + + default: // Also XF_TEXGEN_REGULAR + { + if ((components & (4u /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) { + // This is messy, due to dynamic indexing of the input texture coordinates. + // Hopefully the compiler will unroll this whole loop anyway and the switch. + int tmp = 0; + switch (texgen) { + case 0u: tmp = int(rawtex0.z); break; + case 1u: tmp = int(rawtex1.z); break; + case 2u: tmp = int(rawtex2.z); break; + } + + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + dot(coord, ctrmtx[tmp + 2])); + } else { + output_tex.xyz = float3(dot(coord, ctrmtx[tmp]), + dot(coord, ctrmtx[tmp + 1]), + 1.0); + } + } else { + if (bitfieldExtract(texMtxInfo, 1, 1) == 1u) { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + dot(coord, ctexmtx[3u * texgen + 2u])); + } else { + output_tex.xyz = float3(dot(coord, ctexmtx[3u * texgen]), + dot(coord, ctexmtx[3u * texgen + 1u]), + 1.0); + } + } + } + break; + + } + + if (xfmem_dualTexInfo != 0u) { + uint postMtxInfo = xfmem_postMtxInfo(texgen); uint base_index = bitfieldExtract(postMtxInfo, 0, 6); + float4 P0 = cpostmtx[base_index & 0x3fu]; + float4 P1 = cpostmtx[(base_index + 1u) & 0x3fu]; + float4 P2 = cpostmtx[(base_index + 2u) & 0x3fu]; + + if (bitfieldExtract(postMtxInfo, 8, 1) != 0u) + output_tex.xyz = normalize(output_tex.xyz); + + // multiply by postmatrix + output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w, + dot(P1.xyz, output_tex.xyz) + P1.w, + dot(P2.xyz, output_tex.xyz) + P2.w); + } + + if (texgentype == 0u && output_tex.z == 0.0) // XF_TEXGEN_REGULAR + output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f)); + + // Hopefully GPUs that can support dynamic indexing will optimize this. + switch (texgen) { + case 0u: o.tex0 = output_tex; break; + case 1u: o.tex1 = output_tex; break; + case 2u: o.tex2 = output_tex; break; + } +} +o.clipPos = o.pos; +float clipDepth = o.pos.z * (1.0 - 1e-7); +o.clipDist0 = clipDepth + o.pos.w; +o.clipDist1 = -clipDepth; +o.pos.z = o.pos.w * cpixelcenter.w - o.pos.z * cpixelcenter.z; +o.pos.xy *= sign(cpixelcenter.xy * float2(1.0, -1.0)); +o.pos.xy = o.pos.xy - o.pos.w * cpixelcenter.xy; + vs.pos = o.pos; + vs.colors_0 = o.colors_0; + vs.colors_1 = o.colors_1; + vs.tex0 = o.tex0; + vs.tex1 = o.tex1; + vs.tex2 = o.tex2; + vs.clipPos = o.clipPos; + vs.clipDist0 = o.clipDist0; + vs.clipDist1 = o.clipDist1; +gl_ClipDistance[0] = o.clipDist0; +gl_ClipDistance[1] = o.clipDist1; +gl_Position = o.pos; +} + +[fragment shader] +#version 400 + +#define FORCE_EARLY_Z layout(early_fragment_tests) in + +#extension GL_ARB_shading_language_420pack : enable + +#define ATTRIBUTE_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION(x) +#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y) +#define UBO_BINDING(packing, x) layout(packing, binding = x) +#define SAMPLER_BINDING(x) layout(binding = x) +#define SSBO_BINDING(x) layout(binding = x) + +#define VARYING_LOCATION(x) + +#extension GL_ARB_shader_storage_buffer_object : enable + + + + + + + +#extension GL_ARB_shader_image_load_store : enable + + + + + + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define frac fract +#define lerp mix +// Pixel UberShader for 3 texgens, per-pixel depth +int idot(int3 x, int3 y) +{ + int3 tmp = x * y; + return tmp.x + tmp.y + tmp.z; +} +int idot(int4 x, int4 y) +{ + int4 tmp = x * y; + return tmp.x + tmp.y + tmp.z + tmp.w; +} + +int iround(float x) { return int (round(x)); } +int2 iround(float2 x) { return int2(round(x)); } +int3 iround(float3 x) { return int3(round(x)); } +int4 iround(float4 x) { return int4(round(x)); } + +SAMPLER_BINDING(0) uniform sampler2DArray samp[8]; + +UBO_BINDING(std140, 1) uniform PSBlock { + int4 color[4]; + int4 k[4]; + int4 alphaRef; + float4 texdim[8]; + int4 czbias[2]; + int4 cindscale[2]; + int4 cindmtx[6]; + int4 cfogcolor; + int4 cfogi; + float4 cfogf[2]; + float4 czslope; + float2 cefbscale; + uint bpmem_genmode; + uint bpmem_alphaTest; + uint bpmem_fogParam3; + uint bpmem_fogRangeBase; + uint bpmem_dstalpha; + uint bpmem_ztex_op; + bool bpmem_late_ztest; + bool bpmem_rgba6_format; + bool bpmem_dither; + bool bpmem_bounding_box; + uint4 bpmem_pack1[16]; + uint4 bpmem_pack2[8]; + int4 konstLookup[32]; +}; + +#define bpmem_combiners(i) (bpmem_pack1[(i)].xy) +#define bpmem_tevind(i) (bpmem_pack1[(i)].z) +#define bpmem_iref(i) (bpmem_pack1[(i)].w) +#define bpmem_tevorder(i) (bpmem_pack2[(i)].x) +#define bpmem_tevksel(i) (bpmem_pack2[(i)].y) + +struct VS_OUTPUT { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0; +FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1; +#define depth gl_FragDepth +VARYING_LOCATION(0) in VertexData { + float4 pos; + float4 colors_0; + float4 colors_1; + float3 tex0; + float3 tex1; + float3 tex2; + float4 clipPos; + float clipDist0; + float clipDist1; +}; + +float3 selectTexCoord(uint index) { + switch (index) { + case 0u: + return tex0; + case 1u: + return tex1; + case 2u: + return tex2; + default: + return float3(0.0, 0.0, 0.0); + } +} + +int4 sampleTexture(uint sampler_num, float2 uv) { + return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0); +} + +int4 Swizzle(uint s, int4 color) { + // AKA: Color Channel Swapping + + int4 ret; + ret.r = color[bitfieldExtract(bpmem_tevksel(s * 2u), 0, 2)]; + ret.g = color[bitfieldExtract(bpmem_tevksel(s * 2u), 2, 2)]; + ret.b = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 0, 2)]; + ret.a = color[bitfieldExtract(bpmem_tevksel(s * 2u + 1u), 2, 2)]; + return ret; +} + +int Wrap(int coord, uint mode) { + if (mode == 0u) // ITW_OFF + return coord; + else if (mode < 6u) // ITW_256 to ITW_16 + return coord & (0xfffe >> mode); + else // ITW_0 + return 0; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// TEV's Linear Interpolate, plus bias, add/subtract and scale +int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) { + // Scale C from 0..255 to 0..256 + C += C >> 7; + + // Add bias to D + if (bias == 1u) D += 128; + else if (bias == 2u) D -= 128; + + int3 lerp = (A << 8) + (B - A)*C; + if (shift != 3u) { + lerp = lerp << shift; + D = D << shift; + } + + if ((shift == 3u) == alpha) + lerp = lerp + (op ? 127 : 128); + + int3 result = lerp >> 8; + + // Add/Subtract D + if(op) // Subtract + result = D - result; + else // Add + result = D + result; + + // Most of the Shift was moved inside the lerp for improved percision + // But we still do the divide by 2 here + if (shift == 3u) + result = result >> 1; + return result; +} + +// Implements operations 0-5 of tev's compare mode, +// which are common to both color and alpha channels +bool tevCompare(uint op, int3 color_A, int3 color_B) { + switch (op) { + case 0u: // TEVCMP_R8_GT + return (color_A.r > color_B.r); + case 1u: // TEVCMP_R8_EQ + return (color_A.r == color_B.r); + case 2u: // TEVCMP_GR16_GT + int A_16 = (color_A.r | (color_A.g << 8)); + int B_16 = (color_B.r | (color_B.g << 8)); + return A_16 > B_16; + case 3u: // TEVCMP_GR16_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g); + case 4u: // TEVCMP_BGR24_GT + int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16)); + int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16)); + return A_24 > B_24; + case 5u: // TEVCMP_BGR24_EQ + return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b); + default: + return false; + } +} + +// Helper function for Alpha Test +bool alphaCompare(int a, int b, uint compare) { + switch (compare) { + case 0u: // NEVER + return false; + case 1u: // LESS + return a < b; + case 2u: // EQUAL + return a == b; + case 3u: // LEQUAL + return a <= b; + case 4u: // GREATER + return a > b; + case 5u: // NEQUAL; + return a != b; + case 6u: // GEQUAL + return a >= b; + case 7u: // ALWAYS + return true; + } +} + +struct State { + int4 Reg[4]; + int4 TexColor; + int AlphaBump; +}; +struct StageState { + uint stage; + uint order; + uint cc; + uint ac; +}; + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); +int4 getKonstColor(State s, StageState ss); + +int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.rgb + return s.Reg[0].rgb; + case 1u: // prev.aaa + return s.Reg[0].aaa; + case 2u: // c0.rgb + return s.Reg[1].rgb; + case 3u: // c0.aaa + return s.Reg[1].aaa; + case 4u: // c1.rgb + return s.Reg[2].rgb; + case 5u: // c1.aaa + return s.Reg[2].aaa; + case 6u: // c2.rgb + return s.Reg[3].rgb; + case 7u: // c2.aaa + return s.Reg[3].aaa; + case 8u: + return s.TexColor.rgb; + case 9u: + return s.TexColor.aaa; + case 10u: + return getRasColor(s, ss, colors_0, colors_1).rgb; + case 11u: + return getRasColor(s, ss, colors_0, colors_1).aaa; + case 12u: // One + return int3(255, 255, 255); + case 13u: // Half + return int3(128, 128, 128); + case 14u: + return getKonstColor(s, ss).rgb; + case 15u: // Zero + return int3(0, 0, 0); + } +} + +int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) { + switch (index) { + case 0u: // prev.a + return s.Reg[0].a; + case 1u: // c0.a + return s.Reg[1].a; + case 2u: // c1.a + return s.Reg[2].a; + case 3u: // c2.a + return s.Reg[3].a; + case 4u: + return s.TexColor.a; + case 5u: + return getRasColor(s, ss, colors_0, colors_1).a; + case 6u: + return getKonstColor(s, ss).a; + case 7u: // Zero + return 0; + } +} + +int4 getTevReg(in State s, uint index) { + switch (index) { + case 0u: // prev + return s.Reg[0]; + case 1u: // c0 + return s.Reg[1]; + case 2u: // c1 + return s.Reg[2]; + case 3u: // c2 + return s.Reg[3]; + default: // prev + return s.Reg[0]; + } +} + +void setRegColor(inout State s, uint index, int3 color) { + switch (index) { + case 0u: // prev + s.Reg[0].rgb = color; + break; + case 1u: // c0 + s.Reg[1].rgb = color; + break; + case 2u: // c1 + s.Reg[2].rgb = color; + break; + case 3u: // c2 + s.Reg[3].rgb = color; + break; + } +} + +void setRegAlpha(inout State s, uint index, int alpha) { + switch (index) { + case 0u: // prev + s.Reg[0].a = alpha; + break; + case 1u: // c0 + s.Reg[1].a = alpha; + break; + case 2u: // c1 + s.Reg[2].a = alpha; + break; + case 3u: // c2 + s.Reg[3].a = alpha; + break; + } +} + +#define getTexCoord(index) selectTexCoord((index)) + +void main() +{ + float4 rawpos = gl_FragCoord; + int3 tevcoord = int3(0, 0, 0); + State s; + s.TexColor = int4(0, 0, 0, 0); + s.AlphaBump = 0; + + s.Reg[0] = color[0]; + s.Reg[1] = color[1]; + s.Reg[2] = color[2]; + s.Reg[3] = color[3]; + uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4); + + // Main tev loop + for(uint stage = 0u; stage <= num_stages; stage++) + { + StageState ss; + ss.stage = stage; + ss.cc = bpmem_combiners(stage).x; + ss.ac = bpmem_combiners(stage).y; + ss.order = bpmem_tevorder(stage>>1); + if ((stage & 1u) == 1u) + ss.order = ss.order >> 12; + + uint tex_coord = bitfieldExtract(ss.order, 3, 3); + float3 uv = getTexCoord(tex_coord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[tex_coord].zw); + + bool texture_enabled = (ss.order & 64u) != 0u; + + // Indirect textures + uint tevind = bpmem_tevind(stage); + if (tevind != 0u) + { + uint bs = bitfieldExtract(tevind, 7, 2); + uint fmt = bitfieldExtract(tevind, 2, 2); + uint bias = bitfieldExtract(tevind, 4, 3); + uint bt = bitfieldExtract(tevind, 0, 2); + uint mid = bitfieldExtract(tevind, 9, 4); + + int3 indcoord; +{ + uint iref = bpmem_iref(bt); + if ( iref != 0u) + { + uint texcoord = bitfieldExtract(iref, 0, 3); + uint texmap = bitfieldExtract(iref, 8, 3); + float3 uv = getTexCoord(texcoord); + int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * texdim[texcoord].zw); + + if ((bt & 1u) == 0u) + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].xy; + else + fixedPoint_uv = fixedPoint_uv >> cindscale[bt >> 1].zw; + + indcoord = sampleTexture(texmap, float2(fixedPoint_uv) * texdim[texmap].xy).abg; + } + else + { + indcoord = int3(0, 0, 0); + } +} + if (bs != 0u) + s.AlphaBump = indcoord[bs - 1u]; + switch(fmt) + { + case 0u: + indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0); + indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0); + indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + case 1u: + indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xe0; + break; + case 2u: + indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf0; + break; + case 3u: + indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 1 : 0); + indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0); + indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0); + s.AlphaBump = s.AlphaBump & 0xf8; + break; + } + + // Matrix multiply + int2 indtevtrans = int2(0, 0); + if ((mid & 3u) != 0u) + { + uint mtxidx = 2u * ((mid & 3u) - 1u); + int shift = cindmtx[mtxidx].w; + + switch (mid >> 2) + { + case 0u: // 3x2 S0.10 matrix + indtevtrans = int2(idot(cindmtx[mtxidx].xyz, indcoord), idot(cindmtx[mtxidx + 1u].xyz, indcoord)) >> 3; + break; + case 1u: // S matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8; + break; + case 2u: // T matrix, S17.7 format + indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8; + break; + } + + if (shift >= 0) + indtevtrans = indtevtrans >> shift; + else + indtevtrans = indtevtrans << ((-shift) & 31); + } + + // Wrapping + uint sw = bitfieldExtract(tevind, 13, 3); + uint tw = bitfieldExtract(tevind, 16, 3); + int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw)); + + if ((tevind & 1048576u) != 0u) // add previous tevcoord + tevcoord.xy += wrapped_coord + indtevtrans; + else + tevcoord.xy = wrapped_coord + indtevtrans; + + // Emulate s24 overflows + tevcoord.xy = (tevcoord.xy << 8) >> 8; + } + else if (texture_enabled) + { + tevcoord.xy = fixedPoint_uv; + } + + // Sample texture for stage + if(texture_enabled) { + uint sampler_num = bitfieldExtract(ss.order, 0, 3); + + float2 uv = (float2(tevcoord.xy)) * texdim[sampler_num].xy; + + int4 color = sampleTexture(sampler_num, uv); + + uint swap = bitfieldExtract(ss.ac, 2, 2); + s.TexColor = Swizzle(swap, color); + } else { + // Texture is disabled + s.TexColor = int4(255, 255, 255, 255); + } + + // This is the Meat of TEV + { + // Color Combiner + uint color_a = bitfieldExtract(ss.cc, 12, 4); + uint color_b = bitfieldExtract(ss.cc, 8, 4); + uint color_c = bitfieldExtract(ss.cc, 4, 4); + uint color_d = bitfieldExtract(ss.cc, 0, 4); + uint color_bias = bitfieldExtract(ss.cc, 16, 2); + bool color_op = bool(bitfieldExtract(ss.cc, 18, 1)); + bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1)); + uint color_shift = bitfieldExtract(ss.cc, 20, 2); + uint color_dest = bitfieldExtract(ss.cc, 22, 2); + uint color_compare_op = color_shift << 1 | uint(color_op); + + int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255); + int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255); + int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255); + int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign + + int3 color; + if(color_bias != 3u) { // Normal mode + color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift); + } else { // Compare mode + // op 6 and 7 do a select per color channel + if (color_compare_op == 6u) { + // TEVCMP_RGB8_GT + color.r = (color_A.r > color_B.r) ? color_C.r : 0; + color.g = (color_A.g > color_B.g) ? color_C.g : 0; + color.b = (color_A.b > color_B.b) ? color_C.b : 0; + } else if (color_compare_op == 7u) { + // TEVCMP_RGB8_EQ + color.r = (color_A.r == color_B.r) ? color_C.r : 0; + color.g = (color_A.g == color_B.g) ? color_C.g : 0; + color.b = (color_A.b == color_B.b) ? color_C.b : 0; + } else { + // The remaining ops do one compare which selects all 3 channels + color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0); + } + color = color_D + color; + } + + // Clamp result + if (color_clamp) + color = clamp(color, 0, 255); + else + color = clamp(color, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegColor(s, color_dest, color); + + // Alpha Combiner + uint alpha_a = bitfieldExtract(ss.ac, 13, 3); + uint alpha_b = bitfieldExtract(ss.ac, 10, 3); + uint alpha_c = bitfieldExtract(ss.ac, 7, 3); + uint alpha_d = bitfieldExtract(ss.ac, 4, 3); + uint alpha_bias = bitfieldExtract(ss.ac, 16, 2); + bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1)); + bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1)); + uint alpha_shift = bitfieldExtract(ss.ac, 20, 2); + uint alpha_dest = bitfieldExtract(ss.ac, 22, 2); + uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op); + + int alpha_A; + int alpha_B; + if (alpha_bias != 3u || alpha_compare_op > 5u) { + // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5 + alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255; + alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255; + }; + int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255; + int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign + + + int alpha; + if(alpha_bias != 3u) { // Normal mode + alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift); + } else { // Compare mode + if (alpha_compare_op == 6u) { + // TEVCMP_A8_GT + alpha = (alpha_A > alpha_B) ? alpha_C : 0; + } else if (alpha_compare_op == 7u) { + // TEVCMP_A8_EQ + alpha = (alpha_A == alpha_B) ? alpha_C : 0; + } else { + // All remaining alpha compare ops actually compare the color channels + alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0; + } + alpha = alpha_D + alpha; + } + + // Clamp result + if (alpha_clamp) + alpha = clamp(alpha, 0, 255); + else + alpha = clamp(alpha, -1024, 1023); + + // Write result to the correct input register of the next stage + setRegAlpha(s, alpha_dest, alpha); + } + } // Main tev loop + + int4 TevResult; + TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz; + TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w; + TevResult &= 255; + + int zCoord = int(rawpos.z * 16777216.0); + zCoord = clamp(zCoord, 0, 0xFFFFFF); + + // ZFreeze + if ((bpmem_genmode & 524288u) != 0u) { + float2 screenpos = rawpos.xy * cefbscale.xy; + // Opengl has reversed vertical screenspace coordiantes + screenpos.y = 528.0 - screenpos.y; + zCoord = int(czslope.z + czslope.x * screenpos.x + czslope.y * screenpos.y); + } + + // Depth Texture + int early_zCoord = zCoord; + if (bpmem_ztex_op != 0u) { + int ztex = int(czbias[1].w); // fixed bias + + // Whatever texture was in our last stage, it's now our depth texture + ztex += idot(s.TexColor.xyzw, czbias[0].xyzw); + ztex += (bpmem_ztex_op == 1u) ? zCoord : 0; + zCoord = ztex & 0xFFFFFF; + } + + // If early depth is enabled, write to zbuffer before depth textures + // If early depth isn't enabled, we write to the zbuffer here + int zbuffer_zCoord = bpmem_late_ztest ? zCoord : early_zCoord; + depth = float(zbuffer_zCoord) / 16777216.0; + // Alpha Test + if (bpmem_alphaTest != 0u) { + bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3)); + bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3)); + + // These if statements are written weirdly to work around intel and qualcom bugs with handling booleans. + switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) { + case 0u: // AND + if (comp0 && comp1) break; else discard; break; + case 1u: // OR + if (comp0 || comp1) break; else discard; break; + case 2u: // XOR + if (comp0 != comp1) break; else discard; break; + case 3u: // XNOR + if (comp0 == comp1) break; else discard; break; + } + } + + if (bpmem_dither) { + // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering + // Here the matrix is encoded into the two factor constants + int2 dither = int2(rawpos.xy) & 1; + TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2); + } + + // Fog + uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3); + if (fog_function != 0u) { + // TODO: This all needs to be converted from float to fixed point + float ze; + if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) { + // perspective + // ze = A/(B - (Zs >> B_SHF) + ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w)); + } else { + // orthographic + // ze = a*Zs (here, no B_SHF) + ze = cfogf[1].x * float(zCoord) / 16777216.0; + } + + if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) { + // x_adjust = sqrt((x-center)^2 + k^2)/k + // ze *= x_adjust + // TODO Instead of this theoretical calculation, we should use the + // coefficient table given in the fog range BP registers! + float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x; + x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z; + ze *= x_adjust; + } + + float fog = clamp(ze - cfogf[1].z, 0.0, 1.0); + + if (fog_function > 3u) { + switch (fog_function) { + case 4u: + fog = 1.0 - exp2(-8.0 * fog); + break; + case 5u: + fog = 1.0 - exp2(-8.0 * fog * fog); + break; + case 6u: + fog = exp2(-8.0 * (1.0 - fog)); + break; + case 7u: + fog = 1.0 - fog; + fog = exp2(-8.0 * fog * fog); + break; + } + } + + int ifog = iround(fog * 256.0); + TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8; + } + + if (bpmem_rgba6_format) + ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0; + else + ocol0.rgb = float3(TevResult.rgb) / 255.0; + + if (bpmem_dstalpha != 0u) + ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0; + else + ocol0.a = float(TevResult.a >> 2) / 63.0; + + // Dest alpha override (dual source blending) + // Colors will be blended against the alpha from ocol1 and + // the alpha from ocol0 will be written to the framebuffer. + ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0); +} + +int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) { + // Select Ras for stage + uint ras = bitfieldExtract(ss.order, 7, 3); + if (ras < 2u) { // Lighting Channel 0 or 1 + int4 color = iround(((ras == 0u) ? colors_0 : colors_1) * 255.0); + uint swap = bitfieldExtract(ss.ac, 0, 2); + return Swizzle(swap, color); + } else if (ras == 5u) { // Alpha Bumb + return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump); + } else if (ras == 6u) { // Normalzied Alpha Bump + int normalized = s.AlphaBump | s.AlphaBump >> 5; + return int4(normalized, normalized, normalized, normalized); + } else { + return int4(0, 0, 0, 0); + } +} + +int4 getKonstColor(State s, StageState ss) { + // Select Konst for stage + // TODO: a switch case might be better here than an dynamically // indexed uniform lookup + uint tevksel = bpmem_tevksel(ss.stage>>1); + if ((ss.stage & 1u) == 0u) + return int4(konstLookup[bitfieldExtract(tevksel, 4, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 9, 5)].a); + else + return int4(konstLookup[bitfieldExtract(tevksel, 14, 5)].rgb, konstLookup[bitfieldExtract(tevksel, 19, 5)].a); +} + |