author    José Fonseca <jfonseca@vmware.com>  2013-03-06 11:46:41 +0000
committer José Fonseca <jfonseca@vmware.com>  2013-03-06 11:46:41 +0000
commit    d7c738e13decf8a8a891008c51b437ccbe3434fb (patch)
tree      22f57c3aeef864841e466b6328a851127bff72d3 /thirdparty
parent    4f9982f5ec3dccae65d5a49dfd5a81b9737d90cd (diff)
parent    1e40126d7a03c43538a07e37e2e63d8882c07e7c (diff)
Merge branch 'directxtex'
Diffstat (limited to 'thirdparty')
-rw-r--r--  thirdparty/directxtex/CMakeLists.txt  54
-rw-r--r--  thirdparty/directxtex/DirectXTex/BC.cpp  1131
-rw-r--r--  thirdparty/directxtex/DirectXTex/BC.h  897
-rw-r--r--  thirdparty/directxtex/DirectXTex/BC4BC5.cpp  534
-rw-r--r--  thirdparty/directxtex/DirectXTex/BC6HBC7.cpp  2822
-rw-r--r--  thirdparty/directxtex/DirectXTex/DDS.h  214
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTex.h  466
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTex.inl  223
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp  697
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp  2420
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp  820
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp  1684
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp  327
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp  674
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp  1167
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp  265
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp  377
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexP.h  199
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp  358
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp  1387
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp  759
-rw-r--r--  thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp  946
-rw-r--r--  thirdparty/directxtex/DirectXTex/scoped.h  70
-rw-r--r--  thirdparty/directxtex/Microsoft Public License.rtf  234
-rw-r--r--  thirdparty/directxtex/ReadMe.txt  192
-rw-r--r--  thirdparty/directxtex/XNAMath/xnamath.h  3397
-rw-r--r--  thirdparty/directxtex/XNAMath/xnamathconvert.inl  6383
-rw-r--r--  thirdparty/directxtex/XNAMath/xnamathmatrix.inl  3293
-rw-r--r--  thirdparty/directxtex/XNAMath/xnamathmisc.inl  2460
-rw-r--r--  thirdparty/directxtex/XNAMath/xnamathvector.inl  13673
-rw-r--r--  thirdparty/directxtex/mingw/guid.cpp  5
-rw-r--r--  thirdparty/directxtex/mingw/sal.h  294
32 files changed, 48422 insertions, 0 deletions
diff --git a/thirdparty/directxtex/CMakeLists.txt b/thirdparty/directxtex/CMakeLists.txt
new file mode 100644
index 00000000..14266404
--- /dev/null
+++ b/thirdparty/directxtex/CMakeLists.txt
@@ -0,0 +1,54 @@
+include (CheckIncludeFileCXX)
+
+if (DirectX_D3D11_INCLUDE_DIR)
+
+ include_directories (BEFORE
+ ${DirectX_D3D11_INCLUDE_DIR}
+ )
+
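+  # Prefer the system DirectXMath headers; if they are not found, fall back to the
+  # bundled XNAMath copy (BC.h below selects the include based on USE_XNAMATH).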
+ CHECK_INCLUDE_FILE_CXX (directxmath.h HAVE_DIRECTXMATH)
+ if (NOT HAVE_DIRECTXMATH)
+ include_directories (BEFORE
+ ${CMAKE_CURRENT_SOURCE_DIR}/XNAMath
+ )
+ add_definitions (-DUSE_XNAMATH)
+ endif ()
+
+ if (MINGW)
+ include_directories (BEFORE
+ ${CMAKE_CURRENT_SOURCE_DIR}/mingw
+ ${CMAKE_SOURCE_DIR}/dispatch # for compat.h
+ )
+ add_definitions (
+ -std=c++0x # static_assert, nullptr, unique_ptr
+ -w # inhibit warnings
+
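+    # The following appear to select XNAMath's portable, non-intrinsic code paths
+    # and to avoid MSVC-specific alignment/inline attributes that MinGW lacks: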
+ -D_XM_NO_INTRINSICS_
+ -D_XM_X86_
+ -DXMINLINE=inline
+ -DXM_NO_ALIGNMENT
+ )
+ set (MINGW_GUID mingw/guid.cpp)
+ endif ()
+
+ add_library (directxtex STATIC
+ DirectXTex/BC4BC5.cpp
+ DirectXTex/BC6HBC7.cpp
+ DirectXTex/BC.cpp
+ DirectXTex/DirectXTexCompress.cpp
+ DirectXTex/DirectXTexConvert.cpp
+ #DirectXTex/DirectXTexD3D11.cpp
+ #DirectXTex/DirectXTexDDS.cpp
+ #DirectXTex/DirectXTexFlipRotate.cpp
+ DirectXTex/DirectXTexImage.cpp
+ DirectXTex/DirectXTexMipmaps.cpp
+ DirectXTex/DirectXTexMisc.cpp
+ #DirectXTex/DirectXTexNormalMaps.cpp
+ #DirectXTex/DirectXTexResize.cpp
+ DirectXTex/DirectXTexTGA.cpp
+ DirectXTex/DirectXTexUtil.cpp
+ DirectXTex/DirectXTexWIC.cpp
+ ${MINGW_GUID}
+ )
+
+endif ()
diff --git a/thirdparty/directxtex/DirectXTex/BC.cpp b/thirdparty/directxtex/DirectXTex/BC.cpp
new file mode 100644
index 00000000..c47296df
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/BC.cpp
@@ -0,0 +1,1131 @@
+//-------------------------------------------------------------------------------------
+// BC.cpp
+//
+// Block-compression (BC) functionality for BC1, BC2, BC3 (original DXTn formats)
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+// Experimental encoding variants, not enabled by default
+//#define COLOR_WEIGHTS
+//#define COLOR_AVG_0WEIGHTS
+
+#include "BC.h"
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Constants
+//-------------------------------------------------------------------------------------
+
+// Perceptual weightings for the importance of each channel.
+static const HDRColorA g_Luminance (0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f);
+static const HDRColorA g_LuminanceInv(0.7154f / 0.2125f, 1.0f, 0.7154f / 0.0721f, 1.0f);
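+// (These appear to be the Rec. 709 luma coefficients 0.2125/0.7154/0.0721, normalized so
+// green has weight 1; g_LuminanceInv undoes the scaling after endpoint optimization.)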
+
+//-------------------------------------------------------------------------------------
+// Decode/Encode RGB 5/6/5 colors
+//-------------------------------------------------------------------------------------
+inline static void Decode565(_Out_ HDRColorA *pColor, _In_ const uint16_t w565)
+{
+ pColor->r = (float) ((w565 >> 11) & 31) * (1.0f / 31.0f);
+ pColor->g = (float) ((w565 >> 5) & 63) * (1.0f / 63.0f);
+ pColor->b = (float) ((w565 >> 0) & 31) * (1.0f / 31.0f);
+ pColor->a = 1.0f;
+}
+
+inline static uint16_t Encode565(_In_ const HDRColorA *pColor)
+{
+ HDRColorA Color;
+
+ Color.r = (pColor->r < 0.0f) ? 0.0f : (pColor->r > 1.0f) ? 1.0f : pColor->r;
+ Color.g = (pColor->g < 0.0f) ? 0.0f : (pColor->g > 1.0f) ? 1.0f : pColor->g;
+ Color.b = (pColor->b < 0.0f) ? 0.0f : (pColor->b > 1.0f) ? 1.0f : pColor->b;
+
+ uint16_t w;
+
+ w = (uint16_t) ((static_cast<int32_t>(Color.r * 31.0f + 0.5f) << 11) |
+ (static_cast<int32_t>(Color.g * 63.0f + 0.5f) << 5) |
+ (static_cast<int32_t>(Color.b * 31.0f + 0.5f) << 0));
+
+ return w;
+}
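+// For example, a saturated red input (r=1, g=0, b=0) encodes to 0xF800, and
+// Decode565(0xF800) recovers (1.0f, 0.0f, 0.0f, 1.0f).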
+
+
+//-------------------------------------------------------------------------------------
+static void OptimizeRGB(_Out_ HDRColorA *pX, _Out_ HDRColorA *pY,
+ _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pPoints, _In_ size_t cSteps, _In_ DWORD flags)
+{
+ static const float fEpsilon = (0.25f / 64.0f) * (0.25f / 64.0f);
+ static const float pC3[] = { 2.0f/2.0f, 1.0f/2.0f, 0.0f/2.0f };
+ static const float pD3[] = { 0.0f/2.0f, 1.0f/2.0f, 2.0f/2.0f };
+ static const float pC4[] = { 3.0f/3.0f, 2.0f/3.0f, 1.0f/3.0f, 0.0f/3.0f };
+ static const float pD4[] = { 0.0f/3.0f, 1.0f/3.0f, 2.0f/3.0f, 3.0f/3.0f };
+
+ const float *pC = (3 == cSteps) ? pC3 : pC4;
+ const float *pD = (3 == cSteps) ? pD3 : pD4;
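+    // pC/pD are the palette blend weights: step i of the candidate palette is
+    // X * pC[i] + Y * pD[i], i.e. 3 or 4 evenly spaced colors from X to Y.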
+
+ // Find Min and Max points, as starting point
+ HDRColorA X = (flags & BC_FLAGS_UNIFORM) ? HDRColorA(1.f, 1.f, 1.f, 1.f) : g_Luminance;
+ HDRColorA Y = HDRColorA(0.0f, 0.0f, 0.0f, 1.0f);
+
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+#ifdef COLOR_WEIGHTS
+ if(pPoints[iPoint].a > 0.0f)
+#endif // COLOR_WEIGHTS
+ {
+ if(pPoints[iPoint].r < X.r)
+ X.r = pPoints[iPoint].r;
+
+ if(pPoints[iPoint].g < X.g)
+ X.g = pPoints[iPoint].g;
+
+ if(pPoints[iPoint].b < X.b)
+ X.b = pPoints[iPoint].b;
+
+ if(pPoints[iPoint].r > Y.r)
+ Y.r = pPoints[iPoint].r;
+
+ if(pPoints[iPoint].g > Y.g)
+ Y.g = pPoints[iPoint].g;
+
+ if(pPoints[iPoint].b > Y.b)
+ Y.b = pPoints[iPoint].b;
+ }
+ }
+
+ // Diagonal axis
+ HDRColorA AB;
+
+ AB.r = Y.r - X.r;
+ AB.g = Y.g - X.g;
+ AB.b = Y.b - X.b;
+
+ float fAB = AB.r * AB.r + AB.g * AB.g + AB.b * AB.b;
+
+    // Single color block... no need to root-find
+ if(fAB < FLT_MIN)
+ {
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+ return;
+ }
+
+ // Try all four axis directions, to determine which diagonal best fits data
+ float fABInv = 1.0f / fAB;
+
+ HDRColorA Dir;
+ Dir.r = AB.r * fABInv;
+ Dir.g = AB.g * fABInv;
+ Dir.b = AB.b * fABInv;
+
+ HDRColorA Mid;
+ Mid.r = (X.r + Y.r) * 0.5f;
+ Mid.g = (X.g + Y.g) * 0.5f;
+ Mid.b = (X.b + Y.b) * 0.5f;
+
+ float fDir[4];
+ fDir[0] = fDir[1] = fDir[2] = fDir[3] = 0.0f;
+
+
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+ HDRColorA Pt;
+ Pt.r = (pPoints[iPoint].r - Mid.r) * Dir.r;
+ Pt.g = (pPoints[iPoint].g - Mid.g) * Dir.g;
+ Pt.b = (pPoints[iPoint].b - Mid.b) * Dir.b;
+
+ float f;
+
+#ifdef COLOR_WEIGHTS
+ f = Pt.r + Pt.g + Pt.b;
+ fDir[0] += pPoints[iPoint].a * f * f;
+
+ f = Pt.r + Pt.g - Pt.b;
+ fDir[1] += pPoints[iPoint].a * f * f;
+
+ f = Pt.r - Pt.g + Pt.b;
+ fDir[2] += pPoints[iPoint].a * f * f;
+
+ f = Pt.r - Pt.g - Pt.b;
+ fDir[3] += pPoints[iPoint].a * f * f;
+#else
+ f = Pt.r + Pt.g + Pt.b;
+ fDir[0] += f * f;
+
+ f = Pt.r + Pt.g - Pt.b;
+ fDir[1] += f * f;
+
+ f = Pt.r - Pt.g + Pt.b;
+ fDir[2] += f * f;
+
+ f = Pt.r - Pt.g - Pt.b;
+ fDir[3] += f * f;
+#endif // COLOR_WEIGHTS
+ }
+
+ float fDirMax = fDir[0];
+ size_t iDirMax = 0;
+
+ for(size_t iDir = 1; iDir < 4; iDir++)
+ {
+ if(fDir[iDir] > fDirMax)
+ {
+ fDirMax = fDir[iDir];
+ iDirMax = iDir;
+ }
+ }
+
+ if(iDirMax & 2)
+ {
+ float f = X.g; X.g = Y.g; Y.g = f;
+ }
+
+ if(iDirMax & 1)
+ {
+ float f = X.b; X.b = Y.b; Y.b = f;
+ }
+
+
+    // Two color block... no need to root-find
+ if(fAB < 1.0f / 4096.0f)
+ {
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+ return;
+ }
+
+
+ // Use Newton's Method to find local minima of sum-of-squares error.
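+    // Each iteration projects the pixels onto the current axis, accumulates per-endpoint
+    // error derivatives (dX, dY) and curvatures (d2X, d2Y), and then moves each endpoint
+    // by -d/d2 (an independent Newton step per endpoint and channel).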
+ float fSteps = (float) (cSteps - 1);
+
+ for(size_t iIteration = 0; iIteration < 8; iIteration++)
+ {
+ // Calculate new steps
+ HDRColorA pSteps[4];
+
+ for(size_t iStep = 0; iStep < cSteps; iStep++)
+ {
+ pSteps[iStep].r = X.r * pC[iStep] + Y.r * pD[iStep];
+ pSteps[iStep].g = X.g * pC[iStep] + Y.g * pD[iStep];
+ pSteps[iStep].b = X.b * pC[iStep] + Y.b * pD[iStep];
+ }
+
+
+ // Calculate color direction
+ Dir.r = Y.r - X.r;
+ Dir.g = Y.g - X.g;
+ Dir.b = Y.b - X.b;
+
+ float fLen = (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b);
+
+ if(fLen < (1.0f / 4096.0f))
+ break;
+
+ float fScale = fSteps / fLen;
+
+ Dir.r *= fScale;
+ Dir.g *= fScale;
+ Dir.b *= fScale;
+
+
+ // Evaluate function, and derivatives
+ float d2X, d2Y;
+ HDRColorA dX, dY;
+ d2X = d2Y = dX.r = dX.g = dX.b = dY.r = dY.g = dY.b = 0.0f;
+
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+ float fDot = (pPoints[iPoint].r - X.r) * Dir.r +
+ (pPoints[iPoint].g - X.g) * Dir.g +
+ (pPoints[iPoint].b - X.b) * Dir.b;
+
+
+ size_t iStep;
+ if(fDot <= 0.0f)
+ iStep = 0;
+            else if(fDot >= fSteps)
+ iStep = cSteps - 1;
+ else
+ iStep = static_cast<size_t>(fDot + 0.5f);
+
+
+ HDRColorA Diff;
+ Diff.r = pSteps[iStep].r - pPoints[iPoint].r;
+ Diff.g = pSteps[iStep].g - pPoints[iPoint].g;
+ Diff.b = pSteps[iStep].b - pPoints[iPoint].b;
+
+#ifdef COLOR_WEIGHTS
+ float fC = pC[iStep] * pPoints[iPoint].a * (1.0f / 8.0f);
+ float fD = pD[iStep] * pPoints[iPoint].a * (1.0f / 8.0f);
+#else
+ float fC = pC[iStep] * (1.0f / 8.0f);
+ float fD = pD[iStep] * (1.0f / 8.0f);
+#endif // COLOR_WEIGHTS
+
+ d2X += fC * pC[iStep];
+ dX.r += fC * Diff.r;
+ dX.g += fC * Diff.g;
+ dX.b += fC * Diff.b;
+
+ d2Y += fD * pD[iStep];
+ dY.r += fD * Diff.r;
+ dY.g += fD * Diff.g;
+ dY.b += fD * Diff.b;
+ }
+
+
+ // Move endpoints
+ if(d2X > 0.0f)
+ {
+ float f = -1.0f / d2X;
+
+ X.r += dX.r * f;
+ X.g += dX.g * f;
+ X.b += dX.b * f;
+ }
+
+ if(d2Y > 0.0f)
+ {
+ float f = -1.0f / d2Y;
+
+ Y.r += dY.r * f;
+ Y.g += dY.g * f;
+ Y.b += dY.b * f;
+ }
+
+ if((dX.r * dX.r < fEpsilon) && (dX.g * dX.g < fEpsilon) && (dX.b * dX.b < fEpsilon) &&
+ (dY.r * dY.r < fEpsilon) && (dY.g * dY.g < fEpsilon) && (dY.b * dY.b < fEpsilon))
+ {
+ break;
+ }
+ }
+
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+}
+
+
+//-------------------------------------------------------------------------------------
+inline static void DecodeBC1( _Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_ const D3DX_BC1 *pBC )
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC1) == 8, "D3DX_BC1 should be 8 bytes" );
+
+ static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/63.f, 1.f/31.f, 1.f };
+
+ XMVECTOR clr0 = XMLoadU565( reinterpret_cast<const XMU565*>(&pBC->rgb[0]) );
+ XMVECTOR clr1 = XMLoadU565( reinterpret_cast<const XMU565*>(&pBC->rgb[1]) );
+
+ clr0 = XMVectorMultiply( clr0, s_Scale );
+ clr1 = XMVectorMultiply( clr1, s_Scale );
+
+ clr0 = XMVectorSwizzle( clr0, 2, 1, 0, 3 );
+ clr1 = XMVectorSwizzle( clr1, 2, 1, 0, 3 );
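+    // XMLoadU565 places the low 5 bits (blue here) in x and the high 5 bits (red) in z,
+    // so the swizzle reorders the loaded (b, g, r) into (r, g, b).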
+
+ clr0 = XMVectorSelect( g_XMIdentityR3, clr0, g_XMSelect1110 );
+ clr1 = XMVectorSelect( g_XMIdentityR3, clr1, g_XMSelect1110 );
+
+ XMVECTOR clr2, clr3;
+ if(pBC->rgb[0] <= pBC->rgb[1])
+ {
+ clr2 = XMVectorLerp( clr0, clr1, 0.5f );
+ clr3 = XMVectorZero(); // Alpha of 0
+ }
+ else
+ {
+ clr2 = XMVectorLerp( clr0, clr1, 1.f/3.f );
+ clr3 = XMVectorLerp( clr0, clr1, 2.f/3.f );
+ }
+
+ uint32_t dw = pBC->bitmap;
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 2)
+ {
+ switch(dw & 3)
+ {
+ case 0: pColor[i] = clr0; break;
+ case 1: pColor[i] = clr1; break;
+ case 2: pColor[i] = clr2; break;
+
+ case 3:
+ default: pColor[i] = clr3; break;
+ }
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+#pragma warning(disable: 4616 6001 6201)
+
+static void EncodeBC1(_Out_ D3DX_BC1 *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pColor,
+ _In_ bool bColorKey, _In_ float alphaRef, _In_ DWORD flags)
+{
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC1) == 8, "D3DX_BC1 should be 8 bytes" );
+
+ // Determine if we need to colorkey this block
+ size_t uSteps;
+
+ if (bColorKey)
+ {
+ size_t uColorKey = 0;
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ if(pColor[i].a < alphaRef)
+ uColorKey++;
+ }
+
+ if(NUM_PIXELS_PER_BLOCK == uColorKey)
+ {
+ pBC->rgb[0] = 0x0000;
+ pBC->rgb[1] = 0xffff;
+ pBC->bitmap = 0xffffffff;
+ return;
+ }
+
+ uSteps = (uColorKey > 0) ? 3 : 4;
+ }
+ else
+ {
+ uSteps = 4;
+ }
+
+    // Quantize block to R5G6B5, using Floyd-Steinberg error diffusion. This
+ // increases the chance that colors will map directly to the quantized
+ // axis endpoints.
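+    // The error is spread with the usual Floyd-Steinberg weights: 7/16 to the texel on
+    // the right (i + 1), 3/16 below-left (i + 3), 5/16 below (i + 4) and 1/16 below-right
+    // (i + 5); the (i & 3) and (i < 12) tests keep the diffusion inside the 4x4 block.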
+ HDRColorA Color[NUM_PIXELS_PER_BLOCK];
+ HDRColorA Error[NUM_PIXELS_PER_BLOCK];
+
+ if (flags & BC_FLAGS_DITHER_RGB)
+ memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
+
+ size_t i;
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ HDRColorA Clr;
+ Clr.r = pColor[i].r;
+ Clr.g = pColor[i].g;
+ Clr.b = pColor[i].b;
+
+ if (flags & BC_FLAGS_DITHER_RGB)
+ {
+ Clr.r += Error[i].r;
+ Clr.g += Error[i].g;
+ Clr.b += Error[i].b;
+ }
+
+ Color[i].r = (float) static_cast<int32_t>(Clr.r * 31.0f + 0.5f) * (1.0f / 31.0f);
+ Color[i].g = (float) static_cast<int32_t>(Clr.g * 63.0f + 0.5f) * (1.0f / 63.0f);
+ Color[i].b = (float) static_cast<int32_t>(Clr.b * 31.0f + 0.5f) * (1.0f / 31.0f);
+
+#ifdef COLOR_WEIGHTS
+ Color[i].a = pColor[i].a;
+#else
+ Color[i].a = 1.0f;
+#endif // COLOR_WEIGHTS
+
+ if (flags & BC_FLAGS_DITHER_RGB)
+ {
+ HDRColorA Diff;
+ Diff.r = Color[i].a * (Clr.r - Color[i].r);
+ Diff.g = Color[i].a * (Clr.g - Color[i].g);
+ Diff.b = Color[i].a * (Clr.b - Color[i].b);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 15 );
+ __analysis_assume( i < 15 );
+ Error[i + 1].r += Diff.r * (7.0f / 16.0f);
+ Error[i + 1].g += Diff.g * (7.0f / 16.0f);
+ Error[i + 1].b += Diff.b * (7.0f / 16.0f);
+ }
+
+ if(i < 12)
+ {
+ if(i & 3)
+ {
+ Error[i + 3].r += Diff.r * (3.0f / 16.0f);
+ Error[i + 3].g += Diff.g * (3.0f / 16.0f);
+ Error[i + 3].b += Diff.b * (3.0f / 16.0f);
+ }
+
+ Error[i + 4].r += Diff.r * (5.0f / 16.0f);
+ Error[i + 4].g += Diff.g * (5.0f / 16.0f);
+ Error[i + 4].b += Diff.b * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 11 );
+ __analysis_assume(i < 11 );
+ Error[i + 5].r += Diff.r * (1.0f / 16.0f);
+ Error[i + 5].g += Diff.g * (1.0f / 16.0f);
+ Error[i + 5].b += Diff.b * (1.0f / 16.0f);
+ }
+ }
+ }
+
+ if ( !( flags & BC_FLAGS_UNIFORM ) )
+ {
+ Color[i].r *= g_Luminance.r;
+ Color[i].g *= g_Luminance.g;
+ Color[i].b *= g_Luminance.b;
+ }
+ }
+
+ // Perform 6D root finding function to find two endpoints of color axis.
+ // Then quantize and sort the endpoints depending on mode.
+ HDRColorA ColorA, ColorB, ColorC, ColorD;
+
+ OptimizeRGB(&ColorA, &ColorB, Color, uSteps, flags);
+
+ if ( flags & BC_FLAGS_UNIFORM )
+ {
+ ColorC = ColorA;
+ ColorD = ColorB;
+ }
+ else
+ {
+ ColorC.r = ColorA.r * g_LuminanceInv.r;
+ ColorC.g = ColorA.g * g_LuminanceInv.g;
+ ColorC.b = ColorA.b * g_LuminanceInv.b;
+
+ ColorD.r = ColorB.r * g_LuminanceInv.r;
+ ColorD.g = ColorB.g * g_LuminanceInv.g;
+ ColorD.b = ColorB.b * g_LuminanceInv.b;
+ }
+
+ uint16_t wColorA = Encode565(&ColorC);
+ uint16_t wColorB = Encode565(&ColorD);
+
+ if((uSteps == 4) && (wColorA == wColorB))
+ {
+ pBC->rgb[0] = wColorA;
+ pBC->rgb[1] = wColorB;
+ pBC->bitmap = 0x00000000;
+ return;
+ }
+
+ Decode565(&ColorC, wColorA);
+ Decode565(&ColorD, wColorB);
+
+ if ( flags & BC_FLAGS_UNIFORM )
+ {
+ ColorA = ColorC;
+ ColorB = ColorD;
+ }
+ else
+ {
+ ColorA.r = ColorC.r * g_Luminance.r;
+ ColorA.g = ColorC.g * g_Luminance.g;
+ ColorA.b = ColorC.b * g_Luminance.b;
+
+ ColorB.r = ColorD.r * g_Luminance.r;
+ ColorB.g = ColorD.g * g_Luminance.g;
+ ColorB.b = ColorD.b * g_Luminance.b;
+ }
+
+ // Calculate color steps
+ HDRColorA Step[4];
+
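+    // BC1 signals its mode through the endpoint ordering: rgb[0] <= rgb[1] selects the
+    // 3-color + transparent mode (see DecodeBC1 above), so the endpoints are stored in
+    // whichever order encodes the mode chosen by uSteps.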
+ if((3 == uSteps) == (wColorA <= wColorB))
+ {
+ pBC->rgb[0] = wColorA;
+ pBC->rgb[1] = wColorB;
+
+ Step[0] = ColorA;
+ Step[1] = ColorB;
+ }
+ else
+ {
+ pBC->rgb[0] = wColorB;
+ pBC->rgb[1] = wColorA;
+
+ Step[0] = ColorB;
+ Step[1] = ColorA;
+ }
+
+ static const size_t pSteps3[] = { 0, 2, 1 };
+ static const size_t pSteps4[] = { 0, 2, 3, 1 };
+ const size_t *pSteps;
+
+ if(3 == uSteps)
+ {
+ pSteps = pSteps3;
+
+ HDRColorALerp(&Step[2], &Step[0], &Step[1], 0.5f);
+ }
+ else
+ {
+ pSteps = pSteps4;
+
+ HDRColorALerp(&Step[2], &Step[0], &Step[1], 1.0f / 3.0f);
+ HDRColorALerp(&Step[3], &Step[0], &Step[1], 2.0f / 3.0f);
+ }
+
+ // Calculate color direction
+ HDRColorA Dir;
+
+ Dir.r = Step[1].r - Step[0].r;
+ Dir.g = Step[1].g - Step[0].g;
+ Dir.b = Step[1].b - Step[0].b;
+
+ float fSteps = (float) (uSteps - 1);
+ float fScale = (wColorA != wColorB) ? (fSteps / (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b)) : 0.0f;
+
+ Dir.r *= fScale;
+ Dir.g *= fScale;
+ Dir.b *= fScale;
+
+ // Encode colors
+ uint32_t dw = 0;
+ if (flags & BC_FLAGS_DITHER_RGB)
+ memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
+
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ if((3 == uSteps) && (pColor[i].a < alphaRef))
+ {
+ dw = (3 << 30) | (dw >> 2);
+ }
+ else
+ {
+ HDRColorA Clr;
+ if ( flags & BC_FLAGS_UNIFORM )
+ {
+ Clr.r = pColor[i].r;
+ Clr.g = pColor[i].g;
+ Clr.b = pColor[i].b;
+ }
+ else
+ {
+ Clr.r = pColor[i].r * g_Luminance.r;
+ Clr.g = pColor[i].g * g_Luminance.g;
+ Clr.b = pColor[i].b * g_Luminance.b;
+ }
+
+ if (flags & BC_FLAGS_DITHER_RGB)
+ {
+ Clr.r += Error[i].r;
+ Clr.g += Error[i].g;
+ Clr.b += Error[i].b;
+ }
+
+ float fDot = (Clr.r - Step[0].r) * Dir.r + (Clr.g - Step[0].g) * Dir.g + (Clr.b - Step[0].b) * Dir.b;
+ uint32_t iStep;
+
+ if(fDot <= 0.0f)
+ iStep = 0;
+ else if(fDot >= fSteps)
+ iStep = 1;
+ else
+ iStep = static_cast<uint32_t>( pSteps[static_cast<size_t>(fDot + 0.5f)] );
+
+ dw = (iStep << 30) | (dw >> 2);
+
+ if (flags & BC_FLAGS_DITHER_RGB)
+ {
+ HDRColorA Diff;
+ Diff.r = Color[i].a * (Clr.r - Step[iStep].r);
+ Diff.g = Color[i].a * (Clr.g - Step[iStep].g);
+ Diff.b = Color[i].a * (Clr.b - Step[iStep].b);
+
+ if(3 != (i & 3))
+ {
+ Error[i + 1].r += Diff.r * (7.0f / 16.0f);
+ Error[i + 1].g += Diff.g * (7.0f / 16.0f);
+ Error[i + 1].b += Diff.b * (7.0f / 16.0f);
+ }
+
+ if(i < 12)
+ {
+ if(i & 3)
+ {
+ Error[i + 3].r += Diff.r * (3.0f / 16.0f);
+ Error[i + 3].g += Diff.g * (3.0f / 16.0f);
+ Error[i + 3].b += Diff.b * (3.0f / 16.0f);
+ }
+
+ Error[i + 4].r += Diff.r * (5.0f / 16.0f);
+ Error[i + 4].g += Diff.g * (5.0f / 16.0f);
+ Error[i + 4].b += Diff.b * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ {
+ Error[i + 5].r += Diff.r * (1.0f / 16.0f);
+ Error[i + 5].g += Diff.g * (1.0f / 16.0f);
+ Error[i + 5].b += Diff.b * (1.0f / 16.0f);
+ }
+ }
+ }
+ }
+ }
+
+ pBC->bitmap = dw;
+}
+
+//-------------------------------------------------------------------------------------
+#ifdef COLOR_WEIGHTS
+static void EncodeSolidBC1(_Out_ D3DX_BC1 *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA *pColor)
+{
+#ifdef COLOR_AVG_0WEIGHTS
+ // Compute avg color
+ HDRColorA Color;
+ Color.r = pColor[0].r;
+ Color.g = pColor[0].g;
+ Color.b = pColor[0].b;
+
+ for(size_t i = 1; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ Color.r += pColor[i].r;
+ Color.g += pColor[i].g;
+ Color.b += pColor[i].b;
+ }
+
+ Color.r *= 1.0f / 16.0f;
+ Color.g *= 1.0f / 16.0f;
+ Color.b *= 1.0f / 16.0f;
+
+ uint16_t wColor = Encode565(&Color);
+#else
+ uint16_t wColor = 0x0000;
+#endif // COLOR_AVG_0WEIGHTS
+
+ // Encode solid block
+ pBC->rgb[0] = wColor;
+ pBC->rgb[1] = wColor;
+ pBC->bitmap = 0x00000000;
+}
+#endif // COLOR_WEIGHTS
+
+
+//=====================================================================================
+// Entry points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// BC1 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC1(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ const D3DX_BC1 *pBC1 = reinterpret_cast<const D3DX_BC1 *>(pBC);
+ DecodeBC1( pColor, pBC1 );
+}
+
+void D3DXEncodeBC1(uint8_t *pBC, const XMVECTOR *pColor, float alphaRef, DWORD flags)
+{
+ assert( pBC && pColor );
+
+ HDRColorA Color[NUM_PIXELS_PER_BLOCK];
+
+ if (flags & BC_FLAGS_DITHER_A)
+ {
+ float fError[NUM_PIXELS_PER_BLOCK];
+ memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ HDRColorA clr;
+ XMStoreFloat4( reinterpret_cast<XMFLOAT4*>( &clr ), pColor[i] );
+
+ float fAlph = clr.a + fError[i];
+
+ Color[i].r = clr.r;
+ Color[i].g = clr.g;
+ Color[i].b = clr.b;
+ Color[i].a = (float) static_cast<int32_t>(clr.a + fError[i] + 0.5f);
+
+ float fDiff = fAlph - Color[i].a;
+
+ if(3 != (i & 3))
+ {
+ assert( i < 15 );
+ __analysis_assume( i < 15 );
+ fError[i + 1] += fDiff * (7.0f / 16.0f);
+ }
+
+ if(i < 12)
+ {
+ if(i & 3)
+ fError[i + 3] += fDiff * (3.0f / 16.0f);
+
+ fError[i + 4] += fDiff * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 11 );
+ __analysis_assume( i < 11 );
+ fError[i + 5] += fDiff * (1.0f / 16.0f);
+ }
+ }
+ }
+ }
+ else
+ {
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ XMStoreFloat4( reinterpret_cast<XMFLOAT4*>( &Color[i] ), pColor[i] );
+ }
+ }
+
+ D3DX_BC1 *pBC1 = reinterpret_cast<D3DX_BC1 *>(pBC);
+ EncodeBC1(pBC1, Color, true, alphaRef, flags);
+}
+
+
+//-------------------------------------------------------------------------------------
+// BC2 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC2(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC2) == 16, "D3DX_BC2 should be 16 bytes" );
+
+ const D3DX_BC2 *pBC2 = reinterpret_cast<const D3DX_BC2 *>(pBC);
+
+ // RGB part
+ DecodeBC1(pColor, &pBC2->bc1);
+
+ // 4-bit alpha part
+ DWORD dw = pBC2->bitmap[0];
+
+ for(size_t i = 0; i < 8; ++i, dw >>= 4)
+ pColor[i] = XMVectorSetW( pColor[i], (float) (dw & 0xf) * (1.0f / 15.0f) );
+
+ dw = pBC2->bitmap[1];
+
+ for(size_t i = 8; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 4)
+ pColor[i] = XMVectorSetW( pColor[i], (float) (dw & 0xf) * (1.0f / 15.0f) );
+}
+
+void D3DXEncodeBC2(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags)
+{
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC2) == 16, "D3DX_BC2 should be 16 bytes" );
+
+ HDRColorA Color[NUM_PIXELS_PER_BLOCK];
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ XMStoreFloat4( reinterpret_cast<XMFLOAT4*>( &Color[i] ), pColor[i] );
+ }
+
+ D3DX_BC2 *pBC2 = reinterpret_cast<D3DX_BC2 *>(pBC);
+
+    // 4-bit alpha part. Dithered using Floyd-Steinberg error diffusion.
+ pBC2->bitmap[0] = 0;
+ pBC2->bitmap[1] = 0;
+
+ float fError[NUM_PIXELS_PER_BLOCK];
+ if (flags & BC_FLAGS_DITHER_A)
+ memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ float fAlph = Color[i].a;
+ if (flags & BC_FLAGS_DITHER_A)
+ fAlph += fError[i];
+
+ uint32_t u = (uint32_t) static_cast<int32_t>(fAlph * 15.0f + 0.5f);
+
+ pBC2->bitmap[i >> 3] >>= 4;
+ pBC2->bitmap[i >> 3] |= (u << 28);
+
+ if (flags & BC_FLAGS_DITHER_A)
+ {
+ float fDiff = fAlph - (float) u * (1.0f / 15.0f);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 15 );
+ __analysis_assume( i < 15 );
+ fError[i + 1] += fDiff * (7.0f / 16.0f);
+ }
+
+ if(i < 12)
+ {
+ if(i & 3)
+ fError[i + 3] += fDiff * (3.0f / 16.0f);
+
+ fError[i + 4] += fDiff * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 11 );
+ __analysis_assume( i < 11 );
+ fError[i + 5] += fDiff * (1.0f / 16.0f);
+ }
+ }
+ }
+ }
+
+ // RGB part
+#ifdef COLOR_WEIGHTS
+ if(!pBC2->bitmap[0] && !pBC2->bitmap[1])
+ {
+        EncodeSolidBC1(&pBC2->bc1, Color);
+ return;
+ }
+#endif // COLOR_WEIGHTS
+
+ EncodeBC1(&pBC2->bc1, Color, false, 0.f, flags);
+}
+
+
+//-------------------------------------------------------------------------------------
+// BC3 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC3(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC3) == 16, "D3DX_BC3 should be 16 bytes" );
+
+ const D3DX_BC3 *pBC3 = reinterpret_cast<const D3DX_BC3 *>(pBC);
+
+ // RGB part
+ DecodeBC1(pColor, &pBC3->bc1);
+
+ // Adaptive 3-bit alpha part
+ float fAlpha[8];
+
+ fAlpha[0] = ((float) pBC3->alpha[0]) * (1.0f / 255.0f);
+ fAlpha[1] = ((float) pBC3->alpha[1]) * (1.0f / 255.0f);
+
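+    // alpha[0] > alpha[1] selects the fully interpolated 8-entry alpha palette; otherwise
+    // only entries 2-5 are interpolated and the last two are fixed at 0.0 and 1.0
+    // (standard BC3/DXT5 behavior).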
+ if(pBC3->alpha[0] > pBC3->alpha[1])
+ {
+ for(size_t i = 1; i < 7; ++i)
+ fAlpha[i + 1] = (fAlpha[0] * (7 - i) + fAlpha[1] * i) * (1.0f / 7.0f);
+ }
+ else
+ {
+ for(size_t i = 1; i < 5; ++i)
+ fAlpha[i + 1] = (fAlpha[0] * (5 - i) + fAlpha[1] * i) * (1.0f / 5.0f);
+
+ fAlpha[6] = 0.0f;
+ fAlpha[7] = 1.0f;
+ }
+
+ DWORD dw = pBC3->bitmap[0] | (pBC3->bitmap[1] << 8) | (pBC3->bitmap[2] << 16);
+
+ for(size_t i = 0; i < 8; ++i, dw >>= 3)
+ pColor[i] = XMVectorSetW( pColor[i], fAlpha[dw & 0x7] );
+
+ dw = pBC3->bitmap[3] | (pBC3->bitmap[4] << 8) | (pBC3->bitmap[5] << 16);
+
+ for(size_t i = 8; i < NUM_PIXELS_PER_BLOCK; ++i, dw >>= 3)
+ pColor[i] = XMVectorSetW( pColor[i], fAlpha[dw & 0x7] );
+}
+
+void D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags)
+{
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC3) == 16, "D3DX_BC3 should be 16 bytes" );
+
+ HDRColorA Color[NUM_PIXELS_PER_BLOCK];
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ XMStoreFloat4( reinterpret_cast<XMFLOAT4*>( &Color[i] ), pColor[i] );
+ }
+
+ D3DX_BC3 *pBC3 = reinterpret_cast<D3DX_BC3 *>(pBC);
+
+    // Quantize block to A8, using Floyd-Steinberg error diffusion. This
+ // increases the chance that colors will map directly to the quantized
+ // axis endpoints.
+ float fAlpha[NUM_PIXELS_PER_BLOCK];
+ float fError[NUM_PIXELS_PER_BLOCK];
+
+ float fMinAlpha = Color[0].a;
+ float fMaxAlpha = Color[0].a;
+
+ if (flags & BC_FLAGS_DITHER_A)
+ memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ float fAlph = Color[i].a;
+ if (flags & BC_FLAGS_DITHER_A)
+ fAlph += fError[i];
+
+ fAlpha[i] = static_cast<int32_t>(fAlph * 255.0f + 0.5f) * (1.0f / 255.0f);
+
+ if(fAlpha[i] < fMinAlpha)
+ fMinAlpha = fAlpha[i];
+ else if(fAlpha[i] > fMaxAlpha)
+ fMaxAlpha = fAlpha[i];
+
+ if (flags & BC_FLAGS_DITHER_A)
+ {
+ float fDiff = fAlph - fAlpha[i];
+
+ if(3 != (i & 3))
+ {
+ assert( i < 15 );
+ __analysis_assume( i < 15 );
+ fError[i + 1] += fDiff * (7.0f / 16.0f);
+ }
+
+ if(i < 12)
+ {
+ if(i & 3)
+ fError[i + 3] += fDiff * (3.0f / 16.0f);
+
+ fError[i + 4] += fDiff * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ {
+ assert( i < 11 );
+ __analysis_assume( i < 11 );
+ fError[i + 5] += fDiff * (1.0f / 16.0f);
+ }
+ }
+ }
+ }
+
+#ifdef COLOR_WEIGHTS
+ if(0.0f == fMaxAlpha)
+ {
+        EncodeSolidBC1(&pBC3->bc1, Color);
+ pBC3->alpha[0] = 0x00;
+ pBC3->alpha[1] = 0x00;
+ memset(pBC3->bitmap, 0x00, 6);
+ }
+#endif
+
+ // RGB part
+ EncodeBC1(&pBC3->bc1, Color, false, 0.f, flags);
+
+ // Alpha part
+ if(1.0f == fMinAlpha)
+ {
+ pBC3->alpha[0] = 0xff;
+ pBC3->alpha[1] = 0xff;
+ memset(pBC3->bitmap, 0x00, 6);
+ return;
+ }
+
+ // Optimize and Quantize Min and Max values
+ size_t uSteps = ((0.0f == fMinAlpha) || (1.0f == fMaxAlpha)) ? 6 : 8;
+
+ float fAlphaA, fAlphaB;
+ OptimizeAlpha<false>(&fAlphaA, &fAlphaB, fAlpha, uSteps);
+
+ uint8_t bAlphaA = (uint8_t) static_cast<int32_t>(fAlphaA * 255.0f + 0.5f);
+ uint8_t bAlphaB = (uint8_t) static_cast<int32_t>(fAlphaB * 255.0f + 0.5f);
+
+ fAlphaA = (float) bAlphaA * (1.0f / 255.0f);
+ fAlphaB = (float) bAlphaB * (1.0f / 255.0f);
+
+ // Setup block
+ if((8 == uSteps) && (bAlphaA == bAlphaB))
+ {
+ pBC3->alpha[0] = bAlphaA;
+ pBC3->alpha[1] = bAlphaB;
+ memset(pBC3->bitmap, 0x00, 6);
+ return;
+ }
+
+ static const size_t pSteps6[] = { 0, 2, 3, 4, 5, 1 };
+ static const size_t pSteps8[] = { 0, 2, 3, 4, 5, 6, 7, 1 };
+
+ const size_t *pSteps;
+ float fStep[8];
+
+ if(6 == uSteps)
+ {
+ pBC3->alpha[0] = bAlphaA;
+ pBC3->alpha[1] = bAlphaB;
+
+ fStep[0] = fAlphaA;
+ fStep[1] = fAlphaB;
+
+ for(size_t i = 1; i < 5; ++i)
+ fStep[i + 1] = (fStep[0] * (5 - i) + fStep[1] * i) * (1.0f / 5.0f);
+
+ fStep[6] = 0.0f;
+ fStep[7] = 1.0f;
+
+ pSteps = pSteps6;
+ }
+ else
+ {
+ pBC3->alpha[0] = bAlphaB;
+ pBC3->alpha[1] = bAlphaA;
+
+ fStep[0] = fAlphaB;
+ fStep[1] = fAlphaA;
+
+ for(size_t i = 1; i < 7; ++i)
+ fStep[i + 1] = (fStep[0] * (7 - i) + fStep[1] * i) * (1.0f / 7.0f);
+
+ pSteps = pSteps8;
+ }
+
+ // Encode alpha bitmap
+ float fSteps = (float) (uSteps - 1);
+ float fScale = (fStep[0] != fStep[1]) ? (fSteps / (fStep[1] - fStep[0])) : 0.0f;
+
+ if (flags & BC_FLAGS_DITHER_A)
+ memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
+
+ for(size_t iSet = 0; iSet < 2; iSet++)
+ {
+ uint32_t dw = 0;
+
+ size_t iMin = iSet * 8;
+ size_t iLim = iMin + 8;
+
+ for(size_t i = iMin; i < iLim; ++i)
+ {
+ float fAlph = Color[i].a;
+ if (flags & BC_FLAGS_DITHER_A)
+ fAlph += fError[i];
+ float fDot = (fAlph - fStep[0]) * fScale;
+
+ uint32_t iStep;
+ if(fDot <= 0.0f)
+ iStep = ((6 == uSteps) && (fAlph <= fStep[0] * 0.5f)) ? 6 : 0;
+ else if(fDot >= fSteps)
+ iStep = ((6 == uSteps) && (fAlph >= (fStep[1] + 1.0f) * 0.5f)) ? 7 : 1;
+ else
+ iStep = static_cast<uint32_t>( pSteps[static_cast<size_t>(fDot + 0.5f)] );
+
+ dw = (iStep << 21) | (dw >> 3);
+
+ if (flags & BC_FLAGS_DITHER_A)
+ {
+ float fDiff = (fAlph - fStep[iStep]);
+
+ if(3 != (i & 3))
+ fError[i + 1] += fDiff * (7.0f / 16.0f);
+
+ if(i < 12)
+ {
+ if(i & 3)
+ fError[i + 3] += fDiff * (3.0f / 16.0f);
+
+ fError[i + 4] += fDiff * (5.0f / 16.0f);
+
+ if(3 != (i & 3))
+ fError[i + 5] += fDiff * (1.0f / 16.0f);
+ }
+ }
+ }
+
+ pBC3->bitmap[0 + iSet * 3] = ((uint8_t *) &dw)[0];
+ pBC3->bitmap[1 + iSet * 3] = ((uint8_t *) &dw)[1];
+ pBC3->bitmap[2 + iSet * 3] = ((uint8_t *) &dw)[2];
+ }
+}
+
+} // namespace
\ No newline at end of file
diff --git a/thirdparty/directxtex/DirectXTex/BC.h b/thirdparty/directxtex/DirectXTex/BC.h
new file mode 100644
index 00000000..638058ea
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/BC.h
@@ -0,0 +1,897 @@
+//-------------------------------------------------------------------------------------
+// BC.h
+//
+// Block-compression (BC) functionality
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#include <assert.h>
+
+#ifdef USE_XNAMATH
+#include <xnamath.h>
+#else
+#include <directxmath.h>
+#include <directxpackedvector.h>
+#endif
+
+#include <float.h>
+
+#pragma warning(push)
+#pragma warning(disable : 4005)
+#include <stdint.h>
+#pragma warning(pop)
+
+namespace DirectX
+{
+
+#ifndef USE_XNAMATH
+typedef PackedVector::HALF HALF;
+typedef PackedVector::XMHALF4 XMHALF4;
+typedef PackedVector::XMU565 XMU565;
+#endif
+
+//-------------------------------------------------------------------------------------
+// Constants
+//-------------------------------------------------------------------------------------
+
+const uint16_t F16S_MASK = 0x8000; // f16 sign mask
+const uint16_t F16EM_MASK = 0x7fff; // f16 exp & mantissa mask
+const uint16_t F16MAX = 0x7bff; // MAXFLT bit pattern for XMHALF
+
+#define SIGN_EXTEND(x,nb) ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x))
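+// Sign-extends an nb-bit two's-complement value x to a full int by replicating bit (nb-1).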
+
+// Because these are used in SAL annotations, they need to remain macros rather than const values
+#define NUM_PIXELS_PER_BLOCK 16
+#define BC6H_MAX_REGIONS 2
+#define BC6H_MAX_INDICES 16
+#define BC7_MAX_REGIONS 3
+#define BC7_MAX_INDICES 16
+
+const size_t BC6H_NUM_CHANNELS = 3;
+const size_t BC6H_MAX_SHAPES = 32;
+
+const size_t BC7_NUM_CHANNELS = 4;
+const size_t BC7_MAX_SHAPES = 64;
+
+const uint32_t BC67_WEIGHT_MAX = 64;
+const uint32_t BC67_WEIGHT_SHIFT = 6;
+const uint32_t BC67_WEIGHT_ROUND = 32;
+
+extern const int g_aWeights2[4];
+extern const int g_aWeights3[8];
+extern const int g_aWeights4[16];
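+// BC6H/BC7 palette interpolation uses 6-bit fixed-point weights drawn from the tables
+// above: out = (c0 * (64 - w) + c1 * w + 32) >> 6 (see LDRColorA::InterpolateRGB below).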
+
+enum BC_FLAGS
+{
+ BC_FLAGS_NONE = 0x0,
+ BC_FLAGS_DITHER_RGB = 0x10000, // Enables dithering for RGB colors for BC1-3
+ BC_FLAGS_DITHER_A = 0x20000, // Enables dithering for Alpha channel for BC1-3
+ BC_FLAGS_UNIFORM = 0x40000, // By default, uses perceptual weighting for BC1-3; this flag makes it a uniform weighting
+};
+
+//-------------------------------------------------------------------------------------
+// Structures
+//-------------------------------------------------------------------------------------
+class HDRColorA;
+
+class LDRColorA
+{
+public:
+ uint8_t r, g, b, a;
+
+ LDRColorA() {}
+ LDRColorA(uint8_t _r, uint8_t _g, uint8_t _b, uint8_t _a) : r(_r), g(_g), b(_b), a(_a) {}
+
+ const uint8_t& operator [] (_In_range_(0,3) size_t uElement) const
+ {
+ switch(uElement)
+ {
+ case 0: return r;
+ case 1: return g;
+ case 2: return b;
+ case 3: return a;
+ default: assert(false); return r;
+ }
+ }
+
+ uint8_t& operator [] (_In_range_(0,3) size_t uElement)
+ {
+ switch(uElement)
+ {
+ case 0: return r;
+ case 1: return g;
+ case 2: return b;
+ case 3: return a;
+ default: assert(false); return r;
+ }
+ }
+
+ LDRColorA operator = (_In_ const HDRColorA& c);
+
+ static void InterpolateRGB(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wc, _In_ size_t wcprec, _Out_ LDRColorA& out)
+ {
+ const int* aWeights = nullptr;
+ switch(wcprec)
+ {
+ case 2: aWeights = g_aWeights2; assert( wc < 4 ); __analysis_assume( wc < 4 ); break;
+ case 3: aWeights = g_aWeights3; assert( wc < 8 ); __analysis_assume( wc < 8 ); break;
+ case 4: aWeights = g_aWeights4; assert( wc < 16 ); __analysis_assume( wc < 16 ); break;
+ default: assert(false); out.r = out.g = out.b = 0; return;
+ }
+ out.r = uint8_t((uint32_t(c0.r) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.r) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT);
+ out.g = uint8_t((uint32_t(c0.g) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.g) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT);
+ out.b = uint8_t((uint32_t(c0.b) * uint32_t(BC67_WEIGHT_MAX - aWeights[wc]) + uint32_t(c1.b) * uint32_t(aWeights[wc]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT);
+ }
+
+ static void InterpolateA(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wa, _In_ size_t waprec, _Out_ LDRColorA& out)
+ {
+ const int* aWeights = nullptr;
+ switch(waprec)
+ {
+ case 2: aWeights = g_aWeights2; assert( wa < 4 ); __analysis_assume( wa < 4 ); break;
+ case 3: aWeights = g_aWeights3; assert( wa < 8 ); __analysis_assume( wa < 8 ); break;
+ case 4: aWeights = g_aWeights4; assert( wa < 16 ); __analysis_assume( wa < 16 ); break;
+ default: assert(false); out.a = 0; return;
+ }
+ out.a = uint8_t((uint32_t(c0.a) * uint32_t(BC67_WEIGHT_MAX - aWeights[wa]) + uint32_t(c1.a) * uint32_t(aWeights[wa]) + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT);
+ }
+
+ static void Interpolate(_In_ const LDRColorA& c0, _In_ const LDRColorA& c1, _In_ size_t wc, _In_ size_t wa, _In_ size_t wcprec, _In_ size_t waprec, _Out_ LDRColorA& out)
+ {
+ InterpolateRGB(c0, c1, wc, wcprec, out);
+ InterpolateA(c0, c1, wa, waprec, out);
+ }
+};
+
+class HDRColorA
+{
+public:
+ float r, g, b, a;
+
+public:
+ HDRColorA() {}
+ HDRColorA(float _r, float _g, float _b, float _a) : r(_r), g(_g), b(_b), a(_a) {}
+ HDRColorA(const HDRColorA& c) : r(c.r), g(c.g), b(c.b), a(c.a) {}
+ HDRColorA(const LDRColorA& c)
+ {
+ r = float(c.r) * (1.0f/255.0f);
+ g = float(c.g) * (1.0f/255.0f);
+ b = float(c.b) * (1.0f/255.0f);
+ a = float(c.a) * (1.0f/255.0f);
+ }
+
+ // binary operators
+ HDRColorA operator + ( _In_ const HDRColorA& c ) const
+ {
+ return HDRColorA(r + c.r, g + c.g, b + c.b, a + c.a);
+ }
+
+ HDRColorA operator - ( _In_ const HDRColorA& c ) const
+ {
+ return HDRColorA(r - c.r, g - c.g, b - c.b, a - c.a);
+ }
+
+ HDRColorA operator * ( _In_ float f ) const
+ {
+ return HDRColorA(r * f, g * f, b * f, a * f);
+ }
+
+ HDRColorA operator / ( _In_ float f ) const
+ {
+ float fInv = 1.0f / f;
+ return HDRColorA(r * fInv, g * fInv, b * fInv, a * fInv);
+ }
+
+ float operator * ( _In_ const HDRColorA& c ) const
+ {
+ return r * c.r + g * c.g + b * c.b + a * c.a;
+ }
+
+ // assignment operators
+ HDRColorA& operator += ( _In_ const HDRColorA& c )
+ {
+ r += c.r;
+ g += c.g;
+ b += c.b;
+ a += c.a;
+ return *this;
+ }
+
+ HDRColorA& operator -= ( _In_ const HDRColorA& c )
+ {
+ r -= c.r;
+ g -= c.g;
+ b -= c.b;
+ a -= c.a;
+ return *this;
+ }
+
+ HDRColorA& operator *= ( _In_ float f )
+ {
+ r *= f;
+ g *= f;
+ b *= f;
+ a *= f;
+ return *this;
+ }
+
+ HDRColorA& operator /= ( _In_ float f )
+ {
+ float fInv = 1.0f / f;
+ r *= fInv;
+ g *= fInv;
+ b *= fInv;
+ a *= fInv;
+ return *this;
+ }
+
+ HDRColorA& operator = (_In_ const LDRColorA& c)
+ {
+ r = (float) c.r;
+ g = (float) c.g;
+ b = (float) c.b;
+ a = (float) c.a;
+ return *this;
+ }
+
+ HDRColorA& Clamp(_In_ float fMin, _In_ float fMax)
+ {
+ r = std::min<float>(fMax, std::max<float>(fMin, r));
+ g = std::min<float>(fMax, std::max<float>(fMin, g));
+ b = std::min<float>(fMax, std::max<float>(fMin, b));
+ a = std::min<float>(fMax, std::max<float>(fMin, a));
+ return *this;
+ }
+
+ LDRColorA ToLDRColorA() const
+ {
+ return LDRColorA((uint8_t) (r + 0.01f), (uint8_t) (g + 0.01f), (uint8_t) (b + 0.01f), (uint8_t) (a + 0.01f));
+ }
+};
+
+inline LDRColorA LDRColorA::operator = (_In_ const HDRColorA& c)
+{
+ LDRColorA ret;
+ HDRColorA tmp(c);
+ tmp = tmp.Clamp(0.0f, 1.0f) * 255.0f;
+ ret.r = uint8_t(tmp.r + 0.001f);
+ ret.g = uint8_t(tmp.g + 0.001f);
+ ret.b = uint8_t(tmp.b + 0.001f);
+ ret.a = uint8_t(tmp.a + 0.001f);
+ return ret;
+}
+
+struct LDREndPntPair
+{
+ LDRColorA A;
+ LDRColorA B;
+};
+
+struct HDREndPntPair
+{
+ HDRColorA A;
+ HDRColorA B;
+};
+
+inline HDRColorA* HDRColorALerp(_Out_ HDRColorA *pOut, _In_ const HDRColorA *pC1, _In_ const HDRColorA *pC2, _In_ float s)
+{
+ pOut->r = pC1->r + s * (pC2->r - pC1->r);
+ pOut->g = pC1->g + s * (pC2->g - pC1->g);
+ pOut->b = pC1->b + s * (pC2->b - pC1->b);
+ pOut->a = pC1->a + s * (pC2->a - pC1->a);
+ return pOut;
+}
+
+#pragma pack(push,1)
+// BC1/DXT1 compression (4 bits per texel)
+struct D3DX_BC1
+{
+ uint16_t rgb[2]; // 565 colors
+ uint32_t bitmap; // 2bpp rgb bitmap
+};
+
+// BC2/DXT2/3 compression (8 bits per texel)
+struct D3DX_BC2
+{
+ uint32_t bitmap[2]; // 4bpp alpha bitmap
+ D3DX_BC1 bc1; // BC1 rgb data
+};
+
+// BC3/DXT4/5 compression (8 bits per texel)
+struct D3DX_BC3
+{
+ uint8_t alpha[2]; // alpha values
+ uint8_t bitmap[6]; // 3bpp alpha bitmap
+ D3DX_BC1 bc1; // BC1 rgb data
+};
+#pragma pack(pop)
+
+class INTColor
+{
+public:
+ int r, g, b;
+
+public:
+ INTColor() {}
+ INTColor(int nr, int ng, int nb) {r = nr; g = ng; b = nb;}
+ INTColor(const INTColor& c) {r = c.r; g = c.g; b = c.b;}
+
+ INTColor operator - ( _In_ const INTColor& c ) const
+ {
+ return INTColor(r - c.r, g - c.g, b - c.b);
+ }
+
+ INTColor& operator += ( _In_ const INTColor& c )
+ {
+ r += c.r;
+ g += c.g;
+ b += c.b;
+ return *this;
+ }
+
+ INTColor& operator -= ( _In_ const INTColor& c )
+ {
+ r -= c.r;
+ g -= c.g;
+ b -= c.b;
+ return *this;
+ }
+
+ INTColor& operator &= ( _In_ const INTColor& c )
+ {
+ r &= c.r;
+ g &= c.g;
+ b &= c.b;
+ return *this;
+ }
+
+ int& operator [] ( _In_ uint8_t i )
+ {
+ assert(i < sizeof(INTColor) / sizeof(int));
+ __analysis_assume(i < sizeof(INTColor) / sizeof(int));
+ return ((int*) this)[i];
+ }
+
+ void Set(_In_ const HDRColorA& c, _In_ bool bSigned)
+ {
+ XMHALF4 aF16;
+
+ XMVECTOR v = XMLoadFloat4( (const XMFLOAT4*)& c );
+ XMStoreHalf4( &aF16, v );
+
+ r = F16ToINT(aF16.x, bSigned);
+ g = F16ToINT(aF16.y, bSigned);
+ b = F16ToINT(aF16.z, bSigned);
+ }
+
+ INTColor& Clamp(_In_ int iMin, _In_ int iMax)
+ {
+ r = std::min<int>(iMax, std::max<int>(iMin, r));
+ g = std::min<int>(iMax, std::max<int>(iMin, g));
+ b = std::min<int>(iMax, std::max<int>(iMin, b));
+ return *this;
+ }
+
+ INTColor& SignExtend(_In_ const LDRColorA& Prec)
+ {
+ r = SIGN_EXTEND(r, Prec.r);
+ g = SIGN_EXTEND(g, Prec.g);
+ b = SIGN_EXTEND(b, Prec.b);
+ return *this;
+ }
+
+ void ToF16(_Out_cap_c_(3) HALF aF16[3], _In_ bool bSigned) const
+ {
+ aF16[0] = INT2F16(r, bSigned);
+ aF16[1] = INT2F16(g, bSigned);
+ aF16[2] = INT2F16(b, bSigned);
+ }
+
+private:
+ static int F16ToINT(_In_ const HALF& f, _In_ bool bSigned)
+ {
+ uint16_t input = *((const uint16_t*) &f);
+ int out, s;
+ if(bSigned)
+ {
+ s = input & F16S_MASK;
+ input &= F16EM_MASK;
+ if(input > F16MAX) out = F16MAX;
+ else out = input;
+ out = s ? -out : out;
+ }
+ else
+ {
+ if(input & F16S_MASK) out = 0;
+ else out = input;
+ }
+ return out;
+ }
+
+ static HALF INT2F16(_In_ int input, _In_ bool bSigned)
+ {
+ HALF h;
+ uint16_t out;
+ if(bSigned)
+ {
+ int s = 0;
+ if(input < 0)
+ {
+ s = F16S_MASK;
+ input = -input;
+ }
+ out = uint16_t(s | input);
+ }
+ else
+ {
+ assert(input >= 0 && input <= F16MAX);
+ out = (uint16_t) input;
+ }
+
+ *((uint16_t*) &h) = out;
+ return h;
+ }
+};
+
+struct INTEndPntPair
+{
+ INTColor A;
+ INTColor B;
+};
+
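+// Bit-level reader/writer over the raw block bytes (LSB-first within each byte); the
+// BC6H and BC7 block classes below derive from the 16-byte (128-bit) instantiation.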
+template< size_t SizeInBytes >
+class CBits
+{
+public:
+ uint8_t GetBit(_Inout_ size_t& uStartBit) const
+ {
+ assert(uStartBit < 128);
+ __analysis_assume(uStartBit < 128);
+ size_t uIndex = uStartBit >> 3;
+ uint8_t ret = (m_uBits[uIndex] >> (uStartBit - (uIndex << 3))) & 0x01;
+ uStartBit++;
+ return ret;
+ }
+
+ uint8_t GetBits(_Inout_ size_t& uStartBit, _In_ size_t uNumBits) const
+ {
+ if(uNumBits == 0) return 0;
+ assert(uStartBit + uNumBits <= 128 && uNumBits <= 8);
+ __analysis_assume(uStartBit + uNumBits <= 128 && uNumBits <= 8);
+ uint8_t ret;
+ size_t uIndex = uStartBit >> 3;
+ size_t uBase = uStartBit - (uIndex << 3);
+ if(uBase + uNumBits > 8)
+ {
+ size_t uFirstIndexBits = 8 - uBase;
+ size_t uNextIndexBits = uNumBits - uFirstIndexBits;
+ ret = (m_uBits[uIndex] >> uBase) | ((m_uBits[uIndex+1] & ((1 << uNextIndexBits) - 1)) << uFirstIndexBits);
+ }
+ else
+ {
+ ret = (m_uBits[uIndex] >> uBase) & ((1 << uNumBits) - 1);
+ }
+ assert(ret < (1 << uNumBits));
+ uStartBit += uNumBits;
+ return ret;
+ }
+
+ void SetBit(_Inout_ size_t& uStartBit, _In_ uint8_t uValue)
+ {
+ assert(uStartBit < 128 && uValue < 2);
+ __analysis_assume(uStartBit < 128 && uValue < 2);
+ size_t uIndex = uStartBit >> 3;
+ size_t uBase = uStartBit - (uIndex << 3);
+ m_uBits[uIndex] &= ~(1 << uBase);
+ m_uBits[uIndex] |= uValue << uBase;
+ uStartBit++;
+ }
+
+ void SetBits(_Inout_ size_t& uStartBit, _In_ size_t uNumBits, _In_ uint8_t uValue)
+ {
+ if(uNumBits == 0)
+ return;
+ assert(uStartBit + uNumBits <= 128 && uNumBits <= 8);
+ __analysis_assume(uStartBit + uNumBits <= 128 && uNumBits <= 8);
+ assert(uValue < (1 << uNumBits));
+ size_t uIndex = uStartBit >> 3;
+ size_t uBase = uStartBit - (uIndex << 3);
+ if(uBase + uNumBits > 8)
+ {
+ size_t uFirstIndexBits = 8 - uBase;
+ size_t uNextIndexBits = uNumBits - uFirstIndexBits;
+ m_uBits[uIndex] &= ~(((1 << uFirstIndexBits) - 1) << uBase);
+ m_uBits[uIndex] |= uValue << uBase;
+ m_uBits[uIndex+1] &= ~((1 << uNextIndexBits) - 1);
+ m_uBits[uIndex+1] |= uValue >> uFirstIndexBits;
+ }
+ else
+ {
+ m_uBits[uIndex] &= ~(((1 << uNumBits) - 1) << uBase);
+ m_uBits[uIndex] |= uValue << uBase;
+ }
+ uStartBit += uNumBits;
+ }
+
+private:
+ uint8_t m_uBits[ SizeInBytes ];
+};
+
+#pragma warning(push)
+#pragma warning(disable : 4127 4480 4512)
+
+// BC6H compression (16 bits per texel)
+class D3DX_BC6H : private CBits< 16 >
+{
+public:
+ void Decode(_In_ bool bSigned, _Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut) const;
+ void Encode(_In_ bool bSigned, _In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pIn);
+
+private:
+ enum EField : uint8_t
+ {
+ NA, // N/A
+ M, // Mode
+ D, // Shape
+ RW,
+ RX,
+ RY,
+ RZ,
+ GW,
+ GX,
+ GY,
+ GZ,
+ BW,
+ BX,
+ BY,
+ BZ,
+ };
+
+ struct ModeDescriptor
+ {
+ EField m_eField;
+ uint8_t m_uBit;
+ };
+
+ struct ModeInfo
+ {
+ uint8_t uMode;
+ uint8_t uPartitions;
+ bool bTransformed;
+ uint8_t uIndexPrec;
+ LDRColorA RGBAPrec[BC6H_MAX_REGIONS][2];
+ };
+
+ struct EncodeParams
+ {
+ float fBestErr;
+ const bool bSigned;
+ uint8_t uMode;
+ uint8_t uShape;
+ const HDRColorA* const aHDRPixels;
+ INTEndPntPair aUnqEndPts[BC6H_MAX_SHAPES][BC6H_MAX_REGIONS];
+ INTColor aIPixels[NUM_PIXELS_PER_BLOCK];
+
+ EncodeParams(const HDRColorA* const aOriginal, bool bSignedFormat) :
+ aHDRPixels(aOriginal), fBestErr(FLT_MAX), bSigned(bSignedFormat)
+ {
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ aIPixels[i].Set(aOriginal[i], bSigned);
+ }
+ }
+ };
+
+ static int Quantize(_In_ int iValue, _In_ int prec, _In_ bool bSigned);
+ static int Unquantize(_In_ int comp, _In_ uint8_t uBitsPerComp, _In_ bool bSigned);
+ static int FinishUnquantize(_In_ int comp, _In_ bool bSigned);
+
+ static bool EndPointsFit(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[]);
+
+ void GeneratePaletteQuantized(_In_ const EncodeParams* pEP, _In_ const INTEndPntPair& endPts,
+ _Out_cap_c_(BC6H_MAX_INDICES) INTColor aPalette[]) const;
+ float MapColorsQuantized(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ const INTEndPntPair &endPts) const;
+ float PerturbOne(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ uint8_t ch,
+ _In_ const INTEndPntPair& oldEndPts, _Out_ INTEndPntPair& newEndPts, _In_ float fOldErr, _In_ int do_b) const;
+ void OptimizeOne(_In_ const EncodeParams* pEP, _In_count_(np) const INTColor aColors[], _In_ size_t np, _In_ float aOrgErr,
+ _In_ const INTEndPntPair &aOrgEndPts, _Out_ INTEndPntPair &aOptEndPts) const;
+ void OptimizeEndPoints(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const float aOrgErr[],
+ _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aOrgEndPts[],
+ _Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aOptEndPts[]) const;
+ static void SwapIndices(_In_ const EncodeParams* pEP, _Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[],
+ _In_count_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[]);
+ void AssignIndices(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[],
+ _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[],
+ _Out_cap_c_(BC6H_MAX_REGIONS) float aTotErr[]) const;
+ void QuantizeEndPts(_In_ const EncodeParams* pEP, _Out_cap_c_(BC6H_MAX_REGIONS) INTEndPntPair* qQntEndPts) const;
+ void EmitBlock(_In_ const EncodeParams* pEP, _In_count_c_(BC6H_MAX_REGIONS) const INTEndPntPair aEndPts[],
+ _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndices[]);
+ void Refine(_Inout_ EncodeParams* pEP);
+
+ static void GeneratePaletteUnquantized(_In_ const EncodeParams* pEP, _In_ size_t uRegion, _Out_cap_c_(BC6H_MAX_INDICES) INTColor aPalette[]);
+ float MapColors(_In_ const EncodeParams* pEP, _In_ size_t uRegion, _In_ size_t np, _In_count_(np) const size_t* auIndex) const;
+ float RoughMSE(_Inout_ EncodeParams* pEP) const;
+
+private:
+ const static ModeDescriptor ms_aDesc[][82];
+ const static ModeInfo ms_aInfo[];
+ const static int ms_aModeToInfo[];
+};
+
+// BC7 compression (16 bits per texel)
+class D3DX_BC7 : private CBits< 16 >
+{
+public:
+ void Decode(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut) const;
+ void Encode(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pIn);
+
+private:
+ struct ModeInfo
+ {
+ uint8_t uPartitions;
+ uint8_t uPartitionBits;
+ uint8_t uPBits;
+ uint8_t uRotationBits;
+ uint8_t uIndexModeBits;
+ uint8_t uIndexPrec;
+ uint8_t uIndexPrec2;
+ LDRColorA RGBAPrec;
+ LDRColorA RGBAPrecWithP;
+ };
+
+ struct EncodeParams
+ {
+ uint8_t uMode;
+ LDREndPntPair aEndPts[BC7_MAX_SHAPES][BC7_MAX_REGIONS];
+ LDRColorA aLDRPixels[NUM_PIXELS_PER_BLOCK];
+ const HDRColorA* const aHDRPixels;
+
+ EncodeParams(const HDRColorA* const aOriginal) : aHDRPixels(aOriginal) {}
+ };
+
+ static uint8_t Quantize(_In_ uint8_t comp, _In_ uint8_t uPrec)
+ {
+ assert(0 < uPrec && uPrec <= 8);
+ uint8_t rnd = (uint8_t) std::min<uint16_t>(255, uint16_t(comp) + (1 << (7 - uPrec)));
+ return rnd >> (8 - uPrec);
+ }
+
+ static LDRColorA Quantize(_In_ const LDRColorA& c, _In_ const LDRColorA& RGBAPrec)
+ {
+ LDRColorA q;
+ q.r = Quantize(c.r, RGBAPrec.r);
+ q.g = Quantize(c.g, RGBAPrec.g);
+ q.b = Quantize(c.b, RGBAPrec.b);
+ if(RGBAPrec.a)
+ q.a = Quantize(c.a, RGBAPrec.a);
+ else
+ q.a = 255;
+ return q;
+ }
+
+ static uint8_t Unquantize(_In_ uint8_t comp, _In_ size_t uPrec)
+ {
+ assert(0 < uPrec && uPrec <= 8);
+ comp = comp << (8 - uPrec);
+ return comp | (comp >> uPrec);
+ }
+
+ static LDRColorA Unquantize(_In_ const LDRColorA& c, _In_ const LDRColorA& RGBAPrec)
+ {
+ LDRColorA q;
+ q.r = Unquantize(c.r, RGBAPrec.r);
+ q.g = Unquantize(c.g, RGBAPrec.g);
+ q.b = Unquantize(c.b, RGBAPrec.b);
+ q.a = RGBAPrec.a > 0 ? Unquantize(c.a, RGBAPrec.a) : 255;
+ return q;
+ }
+
+ void GeneratePaletteQuantized(_In_ const EncodeParams* pEP, _In_ size_t uIndexMode, _In_ const LDREndPntPair& endpts,
+ _Out_cap_c_(BC7_MAX_INDICES) LDRColorA aPalette[]) const;
+ float PerturbOne(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA colors[], _In_ size_t np, _In_ size_t uIndexMode,
+ _In_ size_t ch, _In_ const LDREndPntPair &old_endpts,
+ _Out_ LDREndPntPair &new_endpts, _In_ float old_err, _In_ uint8_t do_b) const;
+ void Exhaustive(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA aColors[], _In_ size_t np, _In_ size_t uIndexMode,
+ _In_ size_t ch, _Inout_ float& fOrgErr, _Inout_ LDREndPntPair& optEndPt) const;
+ void OptimizeOne(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA colors[], _In_ size_t np, _In_ size_t uIndexMode,
+ _In_ float orig_err, _In_ const LDREndPntPair &orig_endpts, _Out_ LDREndPntPair &opt_endpts) const;
+ void OptimizeEndPoints(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode,
+ _In_count_c_(BC7_MAX_REGIONS) const float orig_err[],
+ _In_count_c_(BC7_MAX_REGIONS) const LDREndPntPair orig_endpts[],
+ _Out_cap_c_(BC7_MAX_REGIONS) LDREndPntPair opt_endpts[]) const;
+ void AssignIndices(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode,
+ _In_count_c_(BC7_MAX_REGIONS) LDREndPntPair endpts[],
+ _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices[], _Out_cap_c_(NUM_PIXELS_PER_BLOCK) size_t aIndices2[],
+ _Out_cap_c_(BC7_MAX_REGIONS) float afTotErr[]) const;
+ void EmitBlock(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uRotation, _In_ size_t uIndexMode,
+ _In_count_c_(BC7_MAX_REGIONS) const LDREndPntPair aEndPts[],
+ _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndex[],
+ _In_count_c_(NUM_PIXELS_PER_BLOCK) const size_t aIndex2[]);
+ float Refine(_In_ const EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uRotation, _In_ size_t uIndexMode);
+
+ float MapColors(_In_ const EncodeParams* pEP, _In_count_(np) const LDRColorA aColors[], _In_ size_t np, _In_ size_t uIndexMode,
+ _In_ const LDREndPntPair& endPts, _In_ float fMinErr) const;
+ static float RoughMSE(_Inout_ EncodeParams* pEP, _In_ size_t uShape, _In_ size_t uIndexMode);
+
+private:
+ const static ModeInfo ms_aInfo[];
+};
+
+//-------------------------------------------------------------------------------------
+template <bool bRange> void OptimizeAlpha(float *pX, float *pY, const float *pPoints, size_t cSteps)
+{
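+    // bRange widens the value range to [-1, 1]; presumably used by the signed BC4S/BC5S
+    // paths, while the BC3 alpha encoder in BC.cpp uses OptimizeAlpha<false> over [0, 1].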
+ static const float pC6[] = { 5.0f/5.0f, 4.0f/5.0f, 3.0f/5.0f, 2.0f/5.0f, 1.0f/5.0f, 0.0f/5.0f };
+ static const float pD6[] = { 0.0f/5.0f, 1.0f/5.0f, 2.0f/5.0f, 3.0f/5.0f, 4.0f/5.0f, 5.0f/5.0f };
+ static const float pC8[] = { 7.0f/7.0f, 6.0f/7.0f, 5.0f/7.0f, 4.0f/7.0f, 3.0f/7.0f, 2.0f/7.0f, 1.0f/7.0f, 0.0f/7.0f };
+ static const float pD8[] = { 0.0f/7.0f, 1.0f/7.0f, 2.0f/7.0f, 3.0f/7.0f, 4.0f/7.0f, 5.0f/7.0f, 6.0f/7.0f, 7.0f/7.0f };
+
+ const float *pC = (6 == cSteps) ? pC6 : pC8;
+ const float *pD = (6 == cSteps) ? pD6 : pD8;
+
+ float MAX_VALUE = 1.0f;
+ float MIN_VALUE;
+ if (bRange)
+ {
+ MIN_VALUE = -1.0f;
+ }
+ else
+ {
+ MIN_VALUE = 0.0f;
+ }
+
+ // Find Min and Max points, as starting point
+ float fX = MAX_VALUE;
+ float fY = MIN_VALUE;
+
+ if(8 == cSteps)
+ {
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+ if(pPoints[iPoint] < fX)
+ fX = pPoints[iPoint];
+
+ if(pPoints[iPoint] > fY)
+ fY = pPoints[iPoint];
+ }
+ }
+ else
+ {
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+ if(pPoints[iPoint] < fX && pPoints[iPoint] > MIN_VALUE)
+ fX = pPoints[iPoint];
+
+ if(pPoints[iPoint] > fY && pPoints[iPoint] < MAX_VALUE)
+ fY = pPoints[iPoint];
+ }
+
+ if (fX == fY)
+ {
+ fY = MAX_VALUE;
+ }
+ }
+
+ // Use Newton's Method to find local minima of sum-of-squares error.
+ float fSteps = (float) (cSteps - 1);
+
+ for(size_t iIteration = 0; iIteration < 8; iIteration++)
+ {
+ float fScale;
+
+ if((fY - fX) < (1.0f / 256.0f))
+ break;
+
+ fScale = fSteps / (fY - fX);
+
+ // Calculate new steps
+ float pSteps[8];
+
+ for(size_t iStep = 0; iStep < cSteps; iStep++)
+ pSteps[iStep] = pC[iStep] * fX + pD[iStep] * fY;
+
+ if(6 == cSteps)
+ {
+ pSteps[6] = MIN_VALUE;
+ pSteps[7] = MAX_VALUE;
+ }
+
+ // Evaluate function, and derivatives
+ float dX = 0.0f;
+ float dY = 0.0f;
+ float d2X = 0.0f;
+ float d2Y = 0.0f;
+
+ for(size_t iPoint = 0; iPoint < NUM_PIXELS_PER_BLOCK; iPoint++)
+ {
+ float fDot = (pPoints[iPoint] - fX) * fScale;
+
+ size_t iStep;
+
+ if(fDot <= 0.0f)
+ iStep = ((6 == cSteps) && (pPoints[iPoint] <= fX * 0.5f)) ? 6 : 0;
+ else if(fDot >= fSteps)
+ iStep = ((6 == cSteps) && (pPoints[iPoint] >= (fY + 1.0f) * 0.5f)) ? 7 : (cSteps - 1);
+ else
+ iStep = static_cast<int32_t>(fDot + 0.5f);
+
+
+ if(iStep < cSteps)
+ {
+ // D3DX had this computation backwards (pPoints[iPoint] - pSteps[iStep])
+ // this fix improves RMS of the alpha component
+ float fDiff = pSteps[iStep] - pPoints[iPoint];
+
+ dX += pC[iStep] * fDiff;
+ d2X += pC[iStep] * pC[iStep];
+
+ dY += pD[iStep] * fDiff;
+ d2Y += pD[iStep] * pD[iStep];
+ }
+ }
+
+ // Move endpoints
+ if(d2X > 0.0f)
+ fX -= dX / d2X;
+
+ if(d2Y > 0.0f)
+ fY -= dY / d2Y;
+
+ if(fX > fY)
+ {
+ float f = fX; fX = fY; fY = f;
+ }
+
+ if((dX * dX < (1.0f / 64.0f)) && (dY * dY < (1.0f / 64.0f)))
+ break;
+ }
+
+ *pX = (fX < MIN_VALUE) ? MIN_VALUE : (fX > MAX_VALUE) ? MAX_VALUE : fX;
+ *pY = (fY < MIN_VALUE) ? MIN_VALUE : (fY > MAX_VALUE) ? MAX_VALUE : fY;
+}
+#pragma warning(pop)
+
+
+//-------------------------------------------------------------------------------------
+// Functions
+//-------------------------------------------------------------------------------------
+
+typedef void (*BC_DECODE)(XMVECTOR *pColor, const uint8_t *pBC);
+typedef void (*BC_ENCODE)(uint8_t *pDXT, const XMVECTOR *pColor, DWORD flags);
+
+void D3DXDecodeBC1(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC);
+void D3DXDecodeBC2(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC3(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC4U(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC);
+void D3DXDecodeBC4S(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(8) const uint8_t *pBC);
+void D3DXDecodeBC5U(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC5S(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC6HU(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC6HS(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+void D3DXDecodeBC7(_Out_cap_c_(NUM_PIXELS_PER_BLOCK) XMVECTOR *pColor, _In_count_c_(16) const uint8_t *pBC);
+
+void D3DXEncodeBC1(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ float alphaRef, _In_ DWORD flags);
+    // BC1 requires one additional parameter, so it doesn't match the signature of BC_ENCODE above
+
+void D3DXEncodeBC2(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC3(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC4U(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC4S(_Out_cap_c_(8) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC5U(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC5S(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC6HU(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC6HS(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+void D3DXEncodeBC7(_Out_cap_c_(16) uint8_t *pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags);
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/BC4BC5.cpp b/thirdparty/directxtex/DirectXTex/BC4BC5.cpp
new file mode 100644
index 00000000..0320ee2d
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/BC4BC5.cpp
@@ -0,0 +1,534 @@
+//-------------------------------------------------------------------------------------
+// BC4BC5.cpp
+//
+// Block-compression (BC) functionality for BC4 and BC5 (DirectX 10 texture compression)
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#include "BC.h"
+
+#pragma warning(disable : 4201)
+
+namespace DirectX
+{
+
+//------------------------------------------------------------------------------------
+// Constants
+//------------------------------------------------------------------------------------
+
+// Because these are used in SAL annotations, they need to remain macros rather than const values
+#define BLOCK_LEN 4
+    // length of each block in texels
+
+#define BLOCK_SIZE (BLOCK_LEN * BLOCK_LEN)
+ // total texels in a 4x4 block.
+
+//------------------------------------------------------------------------------------
+// Structures
+//-------------------------------------------------------------------------------------
+
+// BC4U/BC5U
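+// A BC4 block is 64 bits: two 8-bit endpoints (red_0, red_1) followed by sixteen 3-bit indices.
+// When red_0 > red_1 the palette is the two endpoints plus six interpolated values; otherwise it
+// is the endpoints plus four interpolated values, with indices 6 and 7 reserved for exactly
+// 0.0 and 1.0 (see DecodeFromIndex below).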
+struct BC4_UNORM
+{
+ float R(size_t uOffset) const
+ {
+ size_t uIndex = GetIndex(uOffset);
+ return DecodeFromIndex(uIndex);
+ }
+
+ float DecodeFromIndex(size_t uIndex) const
+ {
+ if (uIndex == 0)
+ return red_0 / 255.0f;
+ if (uIndex == 1)
+ return red_1 / 255.0f;
+ float fred_0 = red_0 / 255.0f;
+ float fred_1 = red_1 / 255.0f;
+ if (red_0 > red_1)
+ {
+ uIndex -= 1;
+ return (fred_0 * (7-uIndex) + fred_1 * uIndex) / 7.0f;
+ }
+ else
+ {
+ if (uIndex == 6)
+ return 0.0f;
+ if (uIndex == 7)
+ return 1.0f;
+ uIndex -= 1;
+ return (fred_0 * (5-uIndex) + fred_1 * uIndex) / 5.0f;
+ }
+ }
+
+ size_t GetIndex(size_t uOffset) const
+ {
+ return (size_t) ((data >> (3*uOffset + 16)) & 0x07);
+ }
+
+ void SetIndex(size_t uOffset, size_t uIndex)
+ {
+ data &= ~((uint64_t) 0x07 << (3*uOffset + 16));
+ data |= ((uint64_t) uIndex << (3*uOffset + 16));
+ }
+
+ union
+ {
+ struct
+ {
+ uint8_t red_0;
+ uint8_t red_1;
+ uint8_t indices[6];
+ };
+ uint64_t data;
+ };
+};
+
+// BC4S/BC5S
+struct BC4_SNORM
+{
+ float R(size_t uOffset) const
+ {
+ size_t uIndex = GetIndex(uOffset);
+ return DecodeFromIndex(uIndex);
+ }
+
+ float DecodeFromIndex(size_t uIndex) const
+ {
+ int8_t sred_0 = (red_0 == -128)? -127 : red_0;
+ int8_t sred_1 = (red_1 == -128)? -127 : red_1;
+
+ if (uIndex == 0)
+ return sred_0 / 127.0f;
+ if (uIndex == 1)
+ return sred_1 / 127.0f;
+ float fred_0 = sred_0 / 127.0f;
+ float fred_1 = sred_1 / 127.0f;
+ if (red_0 > red_1)
+ {
+ uIndex -= 1;
+ return (fred_0 * (7-uIndex) + fred_1 * uIndex) / 7.0f;
+ }
+ else
+ {
+ if (uIndex == 6)
+ return -1.0f;
+ if (uIndex == 7)
+ return 1.0f;
+ uIndex -= 1;
+ return (fred_0 * (5-uIndex) + fred_1 * uIndex) / 5.0f;
+ }
+ }
+
+ size_t GetIndex(size_t uOffset) const
+ {
+ return (size_t) ((data >> (3*uOffset + 16)) & 0x07);
+ }
+
+ void SetIndex(size_t uOffset, size_t uIndex)
+ {
+ data &= ~((uint64_t) 0x07 << (3*uOffset + 16));
+ data |= ((uint64_t) uIndex << (3*uOffset + 16));
+ }
+
+ union
+ {
+ struct
+ {
+ int8_t red_0;
+ int8_t red_1;
+ uint8_t indices[6];
+ };
+ uint64_t data;
+ };
+};
+
+
+//-------------------------------------------------------------------------------------
+// Convert a floating point value to an 8-bit SNORM
+//-------------------------------------------------------------------------------------
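+// The input is clamped to [-1, 1], scaled by 127 and rounded to the nearest integer, so the
+// encoder never emits -128; the decoder treats -128 the same as -127 (see BC4_SNORM above).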
+static void inline FloatToSNorm( _In_ float fVal, _Out_ int8_t *piSNorm )
+{
+ const uint32_t dwMostNeg = ( 1 << ( 8 * sizeof( int8_t ) - 1 ) );
+
+ if( _isnan( fVal ) )
+ fVal = 0;
+ else
+ if( fVal > 1 )
+ fVal = 1; // Clamp to 1
+ else
+ if( fVal < -1 )
+ fVal = -1; // Clamp to -1
+
+ fVal = fVal * (int8_t) ( dwMostNeg - 1 );
+
+ if( fVal >= 0 )
+ fVal += .5f;
+ else
+ fVal -= .5f;
+
+ *piSNorm = (int8_t) (fVal);
+}
+
+
+//------------------------------------------------------------------------------
+static void FindEndPointsBC4U( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _Out_ uint8_t &endpointU_0, _Out_ uint8_t &endpointU_1)
+{
+    // Boundary values of the codec for the unsigned format
+ float MIN_NORM;
+ float MAX_NORM = 1.0f;
+    uint8_t iStart, iEnd;
+ size_t i;
+
+ MIN_NORM = 0.0f;
+
+ // Find max/min of input texels
+ float fBlockMax = theTexelsU[0];
+ float fBlockMin = theTexelsU[0];
+ for (i = 0; i < BLOCK_SIZE; ++i)
+ {
+ if (theTexelsU[i]<fBlockMin)
+ {
+ fBlockMin = theTexelsU[i];
+ }
+ else if (theTexelsU[i]>fBlockMax)
+ {
+ fBlockMax = theTexelsU[i];
+ }
+ }
+
+    // If the input texels contain boundary values, use the mode that reserves explicit codes
+    // for the boundaries (red_0 <= red_1) so that those values are encoded exactly.
+ bool bUsing4BlockCodec = ( MIN_NORM == fBlockMin || MAX_NORM == fBlockMax );
+
+ // Using Optimize
+ float fStart, fEnd;
+
+ if (!bUsing4BlockCodec)
+ {
+ OptimizeAlpha<false>(&fStart, &fEnd, theTexelsU, 8);
+
+ iStart = (uint8_t) (fStart * 255.0f);
+ iEnd = (uint8_t) (fEnd * 255.0f);
+
+ endpointU_0 = iEnd;
+ endpointU_1 = iStart;
+ }
+ else
+ {
+ OptimizeAlpha<false>(&fStart, &fEnd, theTexelsU, 6);
+
+ iStart = (uint8_t) (fStart * 255.0f);
+ iEnd = (uint8_t) (fEnd * 255.0f);
+
+ endpointU_1 = iEnd;
+ endpointU_0 = iStart;
+ }
+}
+
+static void FindEndPointsBC4S(_In_count_c_(BLOCK_SIZE) const float theTexelsU[], _Out_ int8_t &endpointU_0, _Out_ int8_t &endpointU_1)
+{
+    // Boundary values of the codec for the signed format
+ float MIN_NORM;
+ float MAX_NORM = 1.0f;
+ int8_t iStart, iEnd;
+ size_t i;
+
+ MIN_NORM = -1.0f;
+
+ // Find max/min of input texels
+ float fBlockMax = theTexelsU[0];
+ float fBlockMin = theTexelsU[0];
+ for (i = 0; i < BLOCK_SIZE; ++i)
+ {
+ if (theTexelsU[i]<fBlockMin)
+ {
+ fBlockMin = theTexelsU[i];
+ }
+ else if (theTexelsU[i]>fBlockMax)
+ {
+ fBlockMax = theTexelsU[i];
+ }
+ }
+
+    // If the input texels contain boundary values, use the mode that reserves explicit codes
+    // for the boundaries (red_0 <= red_1) so that those values are encoded exactly.
+ bool bUsing4BlockCodec = ( MIN_NORM == fBlockMin || MAX_NORM == fBlockMax );
+
+ // Using Optimize
+ float fStart, fEnd;
+
+ if (!bUsing4BlockCodec)
+ {
+ OptimizeAlpha<true>(&fStart, &fEnd, theTexelsU, 8);
+
+ FloatToSNorm(fStart, &iStart);
+ FloatToSNorm(fEnd, &iEnd);
+
+ endpointU_0 = iEnd;
+ endpointU_1 = iStart;
+ }
+ else
+ {
+ OptimizeAlpha<true>(&fStart, &fEnd, theTexelsU, 6);
+
+ FloatToSNorm(fStart, &iStart);
+ FloatToSNorm(fEnd, &iEnd);
+
+ endpointU_1 = iEnd;
+ endpointU_0 = iStart;
+ }
+}
+
+
+//------------------------------------------------------------------------------
+static inline void FindEndPointsBC5U( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _In_count_c_(BLOCK_SIZE) const float theTexelsV[],
+ _Out_ uint8_t &endpointU_0, _Out_ uint8_t &endpointU_1, _Out_ uint8_t &endpointV_0, _Out_ uint8_t &endpointV_1)
+{
+    // Encode the U and V channels separately with the BC4 codec.
+ FindEndPointsBC4U( theTexelsU, endpointU_0, endpointU_1);
+ FindEndPointsBC4U( theTexelsV, endpointV_0, endpointV_1);
+}
+
+static inline void FindEndPointsBC5S( _In_count_c_(BLOCK_SIZE) const float theTexelsU[], _In_count_c_(BLOCK_SIZE) const float theTexelsV[],
+ _Out_ int8_t &endpointU_0, _Out_ int8_t &endpointU_1, _Out_ int8_t &endpointV_0, _Out_ int8_t &endpointV_1)
+{
+    // Encode the U and V channels separately with the BC4 codec.
+ FindEndPointsBC4S( theTexelsU, endpointU_0, endpointU_1);
+ FindEndPointsBC4S( theTexelsV, endpointV_0, endpointV_1);
+}
+
+
+//------------------------------------------------------------------------------
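+// Assign each texel the index of the nearest entry in the 8-value palette implied by the
+// block's endpoints (simple exhaustive search).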
+static void FindClosestUNORM(_Inout_ BC4_UNORM* pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const float theTexelsU[])
+{
+ float rGradient[8];
+ int i;
+ for (i = 0; i < 8; ++i)
+ {
+ rGradient[i] = pBC->DecodeFromIndex(i);
+ }
+ for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ size_t uBestIndex = 0;
+ float fBestDelta = 100000;
+ for (size_t uIndex = 0; uIndex < 8; uIndex++)
+ {
+ float fCurrentDelta = fabsf(rGradient[uIndex]-theTexelsU[i]);
+ if (fCurrentDelta < fBestDelta)
+ {
+ uBestIndex = uIndex;
+ fBestDelta = fCurrentDelta;
+ }
+ }
+ pBC->SetIndex(i, uBestIndex);
+ }
+}
+
+static void FindClosestSNORM(_Inout_ BC4_SNORM* pBC, _In_count_c_(NUM_PIXELS_PER_BLOCK) const float theTexelsU[])
+{
+ float rGradient[8];
+ int i;
+ for (i = 0; i < 8; ++i)
+ {
+ rGradient[i] = pBC->DecodeFromIndex(i);
+ }
+ for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ size_t uBestIndex = 0;
+ float fBestDelta = 100000;
+ for (size_t uIndex = 0; uIndex < 8; uIndex++)
+ {
+ float fCurrentDelta = fabsf(rGradient[uIndex]-theTexelsU[i]);
+ if (fCurrentDelta < fBestDelta)
+ {
+ uBestIndex = uIndex;
+ fBestDelta = fCurrentDelta;
+ }
+ }
+ pBC->SetIndex(i, uBestIndex);
+ }
+}
+
+
+//=====================================================================================
+// Entry points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// BC4 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC4U( XMVECTOR *pColor, const uint8_t *pBC )
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" );
+
+ const BC4_UNORM * pBC4 = reinterpret_cast<const BC4_UNORM*>(pBC);
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ pColor[i] = XMVectorSet( pBC4->R(i), 0, 0, 1.0f);
+ }
+}
+
+void D3DXDecodeBC4S(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" );
+
+ const BC4_SNORM * pBC4 = reinterpret_cast<const BC4_SNORM*>(pBC);
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ pColor[i] = XMVectorSet( pBC4->R(i), 0, 0, 1.0f);
+ }
+}
+
+void D3DXEncodeBC4U( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags )
+{
+ UNREFERENCED_PARAMETER( flags );
+
+ assert( pBC && pColor );
+ static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" );
+
+ memset(pBC, 0, sizeof(BC4_UNORM));
+ BC4_UNORM * pBC4 = reinterpret_cast<BC4_UNORM*>(pBC);
+ float theTexelsU[NUM_PIXELS_PER_BLOCK];
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ theTexelsU[i] = XMVectorGetX( pColor[i] );
+ }
+
+ FindEndPointsBC4U(theTexelsU, pBC4->red_0, pBC4->red_1);
+ FindClosestUNORM(pBC4, theTexelsU);
+}
+
+void D3DXEncodeBC4S( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags )
+{
+ UNREFERENCED_PARAMETER( flags );
+
+ assert( pBC && pColor );
+ static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" );
+
+    memset(pBC, 0, sizeof(BC4_SNORM));
+ BC4_SNORM * pBC4 = reinterpret_cast<BC4_SNORM*>(pBC);
+ float theTexelsU[NUM_PIXELS_PER_BLOCK];
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ theTexelsU[i] = XMVectorGetX( pColor[i] );
+ }
+
+ FindEndPointsBC4S(theTexelsU, pBC4->red_0, pBC4->red_1);
+ FindClosestSNORM(pBC4, theTexelsU);
+}
+
+
+//-------------------------------------------------------------------------------------
+// BC5 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC5U(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" );
+
+ const BC4_UNORM * pBCR = reinterpret_cast<const BC4_UNORM*>(pBC);
+ const BC4_UNORM * pBCG = reinterpret_cast<const BC4_UNORM*>(pBC+sizeof(BC4_UNORM));
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ pColor[i] = XMVectorSet(pBCR->R(i), pBCG->R(i), 0, 1.0f);
+ }
+}
+
+void D3DXDecodeBC5S(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" );
+
+ const BC4_SNORM * pBCR = reinterpret_cast<const BC4_SNORM*>(pBC);
+ const BC4_SNORM * pBCG = reinterpret_cast<const BC4_SNORM*>(pBC+sizeof(BC4_SNORM));
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ pColor[i] = XMVectorSet(pBCR->R(i), pBCG->R(i), 0, 1.0f);
+ }
+}
+
+void D3DXEncodeBC5U( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags )
+{
+ UNREFERENCED_PARAMETER( flags );
+
+ assert( pBC && pColor );
+ static_assert( sizeof(BC4_UNORM) == 8, "BC4_UNORM should be 8 bytes" );
+
+ memset(pBC, 0, sizeof(BC4_UNORM)*2);
+ BC4_UNORM * pBCR = reinterpret_cast<BC4_UNORM*>(pBC);
+ BC4_UNORM * pBCG = reinterpret_cast<BC4_UNORM*>(pBC+sizeof(BC4_UNORM));
+ float theTexelsU[NUM_PIXELS_PER_BLOCK];
+ float theTexelsV[NUM_PIXELS_PER_BLOCK];
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ XMFLOAT4A clr;
+ XMStoreFloat4A( &clr, pColor[i] );
+ theTexelsU[i] = clr.x;
+ theTexelsV[i] = clr.y;
+ }
+
+ FindEndPointsBC5U(
+ theTexelsU,
+ theTexelsV,
+ pBCR->red_0,
+ pBCR->red_1,
+ pBCG->red_0,
+ pBCG->red_1);
+
+ FindClosestUNORM(pBCR, theTexelsU);
+ FindClosestUNORM(pBCG, theTexelsV);
+}
+
+void D3DXEncodeBC5S( uint8_t *pBC, const XMVECTOR *pColor, DWORD flags )
+{
+ UNREFERENCED_PARAMETER( flags );
+
+ assert( pBC && pColor );
+ static_assert( sizeof(BC4_SNORM) == 8, "BC4_SNORM should be 8 bytes" );
+
+    memset(pBC, 0, sizeof(BC4_SNORM)*2);
+ BC4_SNORM * pBCR = reinterpret_cast<BC4_SNORM*>(pBC);
+ BC4_SNORM * pBCG = reinterpret_cast<BC4_SNORM*>(pBC+sizeof(BC4_SNORM));
+ float theTexelsU[NUM_PIXELS_PER_BLOCK];
+ float theTexelsV[NUM_PIXELS_PER_BLOCK];
+
+ for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ XMFLOAT4A clr;
+ XMStoreFloat4A( &clr, pColor[i] );
+ theTexelsU[i] = clr.x;
+ theTexelsV[i] = clr.y;
+ }
+
+ FindEndPointsBC5S(
+ theTexelsU,
+ theTexelsV,
+ pBCR->red_0,
+ pBCR->red_1,
+ pBCG->red_0,
+ pBCG->red_1);
+
+ FindClosestSNORM(pBCR, theTexelsU);
+ FindClosestSNORM(pBCG, theTexelsV);
+}
+
+} // namespace
\ No newline at end of file
diff --git a/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp b/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp
new file mode 100644
index 00000000..2e607543
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/BC6HBC7.cpp
@@ -0,0 +1,2822 @@
+//-------------------------------------------------------------------------------------
+// BC6HBC7.cpp
+//
+// Block-compression (BC) functionality for BC6H and BC7 (DirectX 11 texture compression)
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#include "BC.h"
+
+#ifndef USE_XNAMATH
+using namespace DirectX::PackedVector;
+#endif
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Constants
+//-------------------------------------------------------------------------------------
+
+static const float fEpsilon = (0.25f / 64.0f) * (0.25f / 64.0f);
+static const float pC3[] = { 2.0f/2.0f, 1.0f/2.0f, 0.0f/2.0f };
+static const float pD3[] = { 0.0f/2.0f, 1.0f/2.0f, 2.0f/2.0f };
+static const float pC4[] = { 3.0f/3.0f, 2.0f/3.0f, 1.0f/3.0f, 0.0f/3.0f };
+static const float pD4[] = { 0.0f/3.0f, 1.0f/3.0f, 2.0f/3.0f, 3.0f/3.0f };
+
+const int g_aWeights2[] = {0, 21, 43, 64};
+const int g_aWeights3[] = {0, 9, 18, 27, 37, 46, 55, 64};
+const int g_aWeights4[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};
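+// Palette entries are interpolated from the endpoints with these fixed weights out of 64:
+// entry = (e0 * (64 - w) + e1 * w + 32) >> 6, where w is the weight for a given index value.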
+
+// Partition, Shape, Pixel (index into 4x4 block)
+static const uint8_t g_aPartitionTable[3][64][16] =
+{
+ { // 1 Region case has no subsets (all 0)
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+ },
+
+ { // BC6H/BC7 Partition Set for 2 Subsets
+ { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // Shape 0
+ { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // Shape 1
+ { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // Shape 2
+ { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 3
+ { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // Shape 4
+ { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 5
+ { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 6
+ { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 7
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // Shape 8
+ { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 9
+ { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // Shape 10
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // Shape 11
+ { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 12
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 13
+ { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 14
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // Shape 15
+ { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // Shape 16
+ { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // Shape 17
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, // Shape 18
+ { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // Shape 19
+ { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // Shape 20
+ { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // Shape 21
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // Shape 22
+ { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // Shape 23
+ { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // Shape 24
+ { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // Shape 25
+ { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // Shape 26
+ { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // Shape 27
+ { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // Shape 28
+ { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // Shape 29
+ { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // Shape 30
+ { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // Shape 31
+
+ // BC7 Partition Set for 2 Subsets (second-half)
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // Shape 32
+ { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // Shape 33
+ { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // Shape 34
+ { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // Shape 35
+ { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // Shape 36
+ { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // Shape 37
+ { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // Shape 38
+ { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // Shape 39
+ { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // Shape 40
+ { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // Shape 41
+ { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // Shape 42
+ { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // Shape 43
+ { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // Shape 44
+ { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // Shape 45
+ { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // Shape 46
+ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // Shape 47
+ { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // Shape 48
+ { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // Shape 49
+ { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // Shape 50
+ { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // Shape 51
+ { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // Shape 52
+ { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // Shape 53
+ { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // Shape 54
+ { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // Shape 55
+ { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // Shape 56
+ { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // Shape 57
+ { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // Shape 58
+ { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // Shape 59
+ { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // Shape 60
+ { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // Shape 61
+ { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // Shape 62
+ { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // Shape 63
+ },
+
+ { // BC7 Partition Set for 3 Subsets
+ { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // Shape 0
+ { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // Shape 1
+ { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // Shape 2
+ { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // Shape 3
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // Shape 4
+ { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // Shape 5
+ { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // Shape 6
+ { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // Shape 7
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // Shape 8
+ { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // Shape 9
+ { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 10
+ { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // Shape 11
+ { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // Shape 12
+ { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // Shape 13
+ { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // Shape 14
+ { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // Shape 15
+ { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // Shape 16
+ { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // Shape 17
+ { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // Shape 18
+ { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // Shape 19
+ { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // Shape 20
+ { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // Shape 21
+ { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // Shape 22
+ { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // Shape 23
+ { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // Shape 24
+ { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // Shape 25
+ { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // Shape 26
+ { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // Shape 27
+ { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // Shape 28
+ { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // Shape 29
+ { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // Shape 30
+ { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // Shape 31
+ { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // Shape 32
+ { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // Shape 33
+ { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // Shape 34
+ { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // Shape 35
+ { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // Shape 36
+ { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // Shape 37
+ { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // Shape 38
+ { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // Shape 39
+ { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // Shape 40
+ { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 41
+ { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // Shape 42
+ { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // Shape 43
+ { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // Shape 44
+ { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // Shape 45
+ { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // Shape 46
+ { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // Shape 47
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // Shape 48
+ { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // Shape 49
+ { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // Shape 50
+ { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // Shape 51
+ { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // Shape 52
+ { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // Shape 53
+ { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // Shape 54
+ { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // Shape 55
+ { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 56
+ { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // Shape 57
+ { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // Shape 58
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // Shape 59
+ { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 }, // Shape 60
+ { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // Shape 61
+ { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // Shape 62
+ { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // Shape 63
+ }
+};
+
+// Partition, Shape, Fixup
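+// For each shape this table gives the "fix-up" pixel of each subset (subset 0's is always pixel
+// 0). A fix-up pixel's index is stored with its most significant bit dropped, so the encoder
+// swaps a subset's endpoints when needed to force that bit to zero.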
+static const uint8_t g_aFixUp[3][64][3] =
+{
+ { // No fix-ups for 1st subset for BC6H or BC7
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0},
+ { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}, { 0, 0, 0}
+ },
+
+ { // BC6H/BC7 Partition Set Fixups for 2 Subsets
+ { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0,15, 0}, { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0},
+ { 0, 2, 0}, { 0, 8, 0}, { 0, 8, 0}, { 0,15, 0},
+ { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0},
+ { 0, 8, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0},
+
+ // BC7 Partition Set Fixups for 2 Subsets (second-half)
+ { 0,15, 0}, { 0,15, 0}, { 0, 6, 0}, { 0, 8, 0},
+ { 0, 2, 0}, { 0, 8, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0, 2, 0}, { 0, 8, 0}, { 0, 2, 0}, { 0, 2, 0},
+ { 0, 2, 0}, { 0,15, 0}, { 0,15, 0}, { 0, 6, 0},
+ { 0, 6, 0}, { 0, 2, 0}, { 0, 6, 0}, { 0, 8, 0},
+ { 0,15, 0}, { 0,15, 0}, { 0, 2, 0}, { 0, 2, 0},
+ { 0,15, 0}, { 0,15, 0}, { 0,15, 0}, { 0,15, 0},
+ { 0,15, 0}, { 0, 2, 0}, { 0, 2, 0}, { 0,15, 0}
+ },
+
+ { // BC7 Partition Set Fixups for 3 Subsets
+ { 0, 3,15}, { 0, 3, 8}, { 0,15, 8}, { 0,15, 3},
+ { 0, 8,15}, { 0, 3,15}, { 0,15, 3}, { 0,15, 8},
+ { 0, 8,15}, { 0, 8,15}, { 0, 6,15}, { 0, 6,15},
+ { 0, 6,15}, { 0, 5,15}, { 0, 3,15}, { 0, 3, 8},
+ { 0, 3,15}, { 0, 3, 8}, { 0, 8,15}, { 0,15, 3},
+ { 0, 3,15}, { 0, 3, 8}, { 0, 6,15}, { 0,10, 8},
+ { 0, 5, 3}, { 0, 8,15}, { 0, 8, 6}, { 0, 6,10},
+ { 0, 8,15}, { 0, 5,15}, { 0,15,10}, { 0,15, 8},
+ { 0, 8,15}, { 0,15, 3}, { 0, 3,15}, { 0, 5,10},
+ { 0, 6,10}, { 0,10, 8}, { 0, 8, 9}, { 0,15,10},
+ { 0,15, 6}, { 0, 3,15}, { 0,15, 8}, { 0, 5,15},
+ { 0,15, 3}, { 0,15, 6}, { 0,15, 6}, { 0,15, 8},
+ { 0, 3,15}, { 0,15, 3}, { 0, 5,15}, { 0, 5,15},
+ { 0, 5,15}, { 0, 8,15}, { 0, 5,15}, { 0,10,15},
+ { 0, 5,15}, { 0,10,15}, { 0, 8,15}, { 0,13,15},
+ { 0,15, 3}, { 0,12,15}, { 0, 3,15}, { 0, 3, 8}
+ }
+};
+
+// BC6H Compression
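+// Each of the 14 rows below lists, in stream order, which field each of the first 82 bits of a
+// BC6H block feeds: M = mode, D = shape, R/G/B plus W/X/Y/Z = a colour channel of one of the
+// four endpoints, NA = unused.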
+const D3DX_BC6H::ModeDescriptor D3DX_BC6H::ms_aDesc[14][82] =
+{
+ { // 0x00 - 10 5 5 5
+ { M, 0}, { M, 1}, {GY, 4}, {BY, 4}, {BZ, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x01 - 7 6 6 6
+ { M, 0}, { M, 1}, {GY, 5}, {GZ, 4}, {GZ, 5}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {BZ, 0}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {BY, 5}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BZ, 3}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x02 - 11 5 4 4
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RW,10}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,10},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,10},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x06 - 11 4 5 4
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,10},
+ {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GW,10}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,10},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {BZ, 0},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {GY, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x0a - 11 4 4 5
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,10},
+ {BY, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,10},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BW,10}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {BZ, 1},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {BZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x0e - 9 5 5 5
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x12 - 8 6 5 5
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {GZ, 4}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BZ, 3}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x16 - 8 5 6 5
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {BZ, 0}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GY, 5}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {GZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BZ, 1}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x1a - 8 5 5 6
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {BY, 5}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {GZ, 4}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {BZ, 0}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {BZ, 2}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {BZ, 3}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x1e - 6 6 6 6
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {GZ, 4}, {BZ, 0}, {BZ, 1}, {BY, 4}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GY, 5}, {BY, 5}, {BZ, 2}, {GY, 4}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {GZ, 5}, {BZ, 3}, {BZ, 5}, {BZ, 4}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {GY, 0}, {GY, 1}, {GY, 2}, {GY, 3}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GZ, 0}, {GZ, 1}, {GZ, 2}, {GZ, 3}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BY, 0}, {BY, 1}, {BY, 2}, {BY, 3}, {RY, 0}, {RY, 1}, {RY, 2}, {RY, 3}, {RY, 4},
+ {RY, 5}, {RZ, 0}, {RZ, 1}, {RZ, 2}, {RZ, 3}, {RZ, 4}, {RZ, 5}, { D, 0}, { D, 1}, { D, 2},
+ { D, 3}, { D, 4},
+ },
+
+ { // 0x03 - 10 10
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {RX, 6}, {RX, 7}, {RX, 8}, {RX, 9}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GX, 6}, {GX, 7}, {GX, 8}, {GX, 9}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BX, 6}, {BX, 7}, {BX, 8}, {BX, 9}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0},
+ },
+
+ { // 0x07 - 11 9
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {RX, 6}, {RX, 7}, {RX, 8}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GX, 6}, {GX, 7}, {GX, 8}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BX, 6}, {BX, 7}, {BX, 8}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0},
+ },
+
+ { // 0x0b - 12 8
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RX, 4},
+ {RX, 5}, {RX, 6}, {RX, 7}, {RW,11}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GX, 4},
+ {GX, 5}, {GX, 6}, {GX, 7}, {GW,11}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BX, 4},
+ {BX, 5}, {BX, 6}, {BX, 7}, {BW,11}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0},
+ },
+
+ { // 0x0f - 16 4
+ { M, 0}, { M, 1}, { M, 2}, { M, 3}, { M, 4}, {RW, 0}, {RW, 1}, {RW, 2}, {RW, 3}, {RW, 4},
+ {RW, 5}, {RW, 6}, {RW, 7}, {RW, 8}, {RW, 9}, {GW, 0}, {GW, 1}, {GW, 2}, {GW, 3}, {GW, 4},
+ {GW, 5}, {GW, 6}, {GW, 7}, {GW, 8}, {GW, 9}, {BW, 0}, {BW, 1}, {BW, 2}, {BW, 3}, {BW, 4},
+ {BW, 5}, {BW, 6}, {BW, 7}, {BW, 8}, {BW, 9}, {RX, 0}, {RX, 1}, {RX, 2}, {RX, 3}, {RW,15},
+ {RW,14}, {RW,13}, {RW,12}, {RW,11}, {RW,10}, {GX, 0}, {GX, 1}, {GX, 2}, {GX, 3}, {GW,15},
+ {GW,14}, {GW,13}, {GW,12}, {GW,11}, {GW,10}, {BX, 0}, {BX, 1}, {BX, 2}, {BX, 3}, {BW,15},
+ {BW,14}, {BW,13}, {BW,12}, {BW,11}, {BW,10}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0}, {NA, 0},
+ {NA, 0}, {NA, 0},
+ },
+};
+
+// Mode, Partitions, Transformed, IndexPrec, RGBAPrec
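+// "Transformed" means every endpoint except the first (region 0 endpoint A) is stored as a
+// delta from that endpoint (see TransformForward/TransformInverse below); modes 9 and 10 store
+// their endpoints untransformed.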
+const D3DX_BC6H::ModeInfo D3DX_BC6H::ms_aInfo[] =
+{
+ {0x00, 1, true, 3, LDRColorA(10,10,10,0), LDRColorA( 5, 5, 5,0), LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)}, // Mode 0
+ {0x01, 1, true, 3, LDRColorA( 7, 7, 7,0), LDRColorA( 6, 6, 6,0), LDRColorA(6,6,6,0), LDRColorA(6,6,6,0)}, // Mode 1
+ {0x02, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 5, 4, 4,0), LDRColorA(5,4,4,0), LDRColorA(5,4,4,0)}, // Mode 2
+ {0x06, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 4, 5, 4,0), LDRColorA(4,5,4,0), LDRColorA(4,5,4,0)}, // Mode 3
+ {0x0a, 1, true, 3, LDRColorA(11,11,11,0), LDRColorA( 4, 4, 5,0), LDRColorA(4,4,5,0), LDRColorA(4,4,5,0)}, // Mode 4
+ {0x0e, 1, true, 3, LDRColorA( 9, 9, 9,0), LDRColorA( 5, 5, 5,0), LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)}, // Mode 5
+ {0x12, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 6, 5, 5,0), LDRColorA(6,5,5,0), LDRColorA(6,5,5,0)}, // Mode 6
+ {0x16, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 5, 6, 5,0), LDRColorA(5,6,5,0), LDRColorA(5,6,5,0)}, // Mode 7
+ {0x1a, 1, true, 3, LDRColorA( 8, 8, 8,0), LDRColorA( 5, 5, 6,0), LDRColorA(5,5,6,0), LDRColorA(5,5,6,0)}, // Mode 8
+ {0x1e, 1, false, 3, LDRColorA( 6, 6, 6,0), LDRColorA( 6, 6, 6,0), LDRColorA(6,6,6,0), LDRColorA(6,6,6,0)}, // Mode 9
+ {0x03, 0, false, 4, LDRColorA(10,10,10,0), LDRColorA(10,10,10,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 10
+ {0x07, 0, true, 4, LDRColorA(11,11,11,0), LDRColorA( 9, 9, 9,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 11
+ {0x0b, 0, true, 4, LDRColorA(12,12,12,0), LDRColorA( 8, 8, 8,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 12
+ {0x0f, 0, true, 4, LDRColorA(16,16,16,0), LDRColorA( 4, 4, 4,0), LDRColorA(0,0,0,0), LDRColorA(0,0,0,0)}, // Mode 13
+};
+
+const int D3DX_BC6H::ms_aModeToInfo[] =
+{
+ 0, // 0x00
+ 1, // 0x01
+ 2, // 0x02
+ 10, // 0x03
+ -1, // 0x04
+ -1, // 0x05
+ 3, // 0x06
+ 11, // 0x07
+ -1, // 0x08
+ -1, // 0x09
+ 4, // 0x0a
+ 12, // 0x0b
+ -1, // 0x0c
+ -1, // 0x0d
+ 5, // 0x0e
+ 13, // 0x0f
+ -1, // 0x10
+ -1, // 0x11
+ 6, // 0x12
+ -1, // 0x13
+ -1, // 0x14
+ -1, // 0x15
+ 7, // 0x16
+ -1, // 0x17
+ -1, // 0x18
+ -1, // 0x19
+ 8, // 0x1a
+ -1, // 0x1b
+ -1, // 0x1c
+ -1, // 0x1d
+ 9, // 0x1e
+ -1, // 0x1f
+};
+
+// BC7 compression: uPartitions, uPartitionBits, uPBits, uRotationBits, uIndexModeBits, uIndexPrec, uIndexPrec2, RGBAPrec, RGBAPrecWithP
+const D3DX_BC7::ModeInfo D3DX_BC7::ms_aInfo[] =
+{
+ {2, 4, 6, 0, 0, 3, 0, LDRColorA(4,4,4,0), LDRColorA(5,5,5,0)},
+    // Mode 0: Color only, 3 Subsets, RGBP 4441 (unique P-bit), 3-bit indices, 16 partitions
+ {1, 6, 2, 0, 0, 3, 0, LDRColorA(6,6,6,0), LDRColorA(7,7,7,0)},
+    // Mode 1: Color only, 2 Subsets, RGBP 6661 (shared P-bit), 3-bit indices, 64 partitions
+ {2, 6, 0, 0, 0, 2, 0, LDRColorA(5,5,5,0), LDRColorA(5,5,5,0)},
+    // Mode 2: Color only, 3 Subsets, RGB 555, 2-bit indices, 64 partitions
+ {1, 6, 4, 0, 0, 2, 0, LDRColorA(7,7,7,0), LDRColorA(8,8,8,0)},
+    // Mode 3: Color only, 2 Subsets, RGBP 7771 (unique P-bit), 2-bit indices, 64 partitions
+ {0, 0, 0, 2, 1, 2, 3, LDRColorA(5,5,5,6), LDRColorA(5,5,5,6)},
+ // Mode 4: Color w/ Separate Alpha, 1 Subset, RGB 555, A6, 16x2/16x3-bit indices, 2-bit rotation, 1-bit index selector
+ {0, 0, 0, 2, 0, 2, 2, LDRColorA(7,7,7,8), LDRColorA(7,7,7,8)},
+ // Mode 5: Color w/ Separate Alpha, 1 Subset, RGB 777, A8, 16x2/16x2-bit indices, 2-bit rotation
+ {0, 0, 2, 0, 0, 4, 0, LDRColorA(7,7,7,7), LDRColorA(8,8,8,8)},
+    // Mode 6: Color+Alpha, 1 Subset, RGBAP 77771 (unique P-bit), 16x4-bit indices
+ {1, 6, 4, 0, 0, 2, 0, LDRColorA(5,5,5,5), LDRColorA(6,6,6,6)}
+ // Mode 7: Color+Alpha, 2 Subsets, RGBAP 55551 (unique P-bit), 2-bit indices, 64 partitions
+};
+
+
+//-------------------------------------------------------------------------------------
+// Helper functions
+//-------------------------------------------------------------------------------------
+template< class T >
+inline static void Swap( T& a, T& b )
+{
+ T temp = a;
+ a = b;
+ b = temp;
+}
+
+inline static bool IsFixUpOffset(_In_range_(0,2) size_t uPartitions, _In_range_(0,63) size_t uShape, _In_range_(0,15) size_t uOffset)
+{
+ assert(uPartitions < 3 && uShape < 64 && uOffset < 16);
+ __analysis_assume(uPartitions < 3 && uShape < 64 && uOffset < 16);
+ for(size_t p = 0; p <= uPartitions; p++)
+ {
+ if(uOffset == g_aFixUp[uPartitions][uShape][p])
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+inline static float ErrorMetricRGB(_In_ const LDRColorA& a, _In_ const LDRColorA& b)
+{
+ float er = float(a.r) - float(b.r);
+ float eg = float(a.g) - float(b.g);
+ float eb = float(a.b) - float(b.b);
+ // weigh the components nonuniformly
+ //er *= 0.299;
+ //eg *= 0.587;
+ //eb *= 0.114;
+ return er*er + eg*eg + eb*eb;
+}
+
+inline static float ErrorMetricAlpha(_In_ const LDRColorA& a, _In_ const LDRColorA& b)
+{
+ float ea = float(a.a) - float(b.a);
+ return ea*ea;
+}
+
+inline static float ErrorMetric(_In_ const LDRColorA& a, _In_ const LDRColorA& b)
+{
+ return ErrorMetricRGB(a, b) + ErrorMetricAlpha(a, b);
+}
+
+inline static void TransformForward(_Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[])
+{
+ aEndPts[0].B -= aEndPts[0].A;
+ aEndPts[1].A -= aEndPts[0].A;
+ aEndPts[1].B -= aEndPts[0].A;
+}
+
+inline static void TransformInverse(_Inout_count_c_(BC6H_MAX_REGIONS) INTEndPntPair aEndPts[], _In_ const LDRColorA& Prec, _In_ bool bSigned)
+{
+ INTColor WrapMask((1 << Prec.r) - 1, (1 << Prec.g) - 1, (1 << Prec.b) - 1);
+ aEndPts[0].B += aEndPts[0].A; aEndPts[0].B &= WrapMask;
+ aEndPts[1].A += aEndPts[0].A; aEndPts[1].A &= WrapMask;
+ aEndPts[1].B += aEndPts[0].A; aEndPts[1].B &= WrapMask;
+ if(bSigned)
+ {
+ aEndPts[0].B.SignExtend(Prec);
+ aEndPts[1].A.SignExtend(Prec);
+ aEndPts[1].B.SignExtend(Prec);
+ }
+}
+
+inline static float Norm(_In_ const INTColor& a, _In_ const INTColor& b)
+{
+ float dr = float(a.r) - float(b.r);
+ float dg = float(a.g) - float(b.g);
+ float db = float(a.b) - float(b.b);
+ return dr * dr + dg * dg + db * db;
+}
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+inline static int NBits(_In_ int n, _In_ bool bIsSigned)
+{
+ int nb;
+ if(n == 0)
+ {
+ return 0; // no bits needed for 0, signed or not
+ }
+ else if(n > 0)
+ {
+ for(nb = 0; n; ++nb, n >>= 1);
+ return nb + (bIsSigned ? 1 : 0);
+ }
+ else
+ {
+ assert(bIsSigned);
+ for(nb = 0; n < -1; ++nb, n >>= 1) ;
+ return nb + 1;
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+static float OptimizeRGB(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pPoints,
+ _Out_ HDRColorA* pX, _Out_ HDRColorA* pY,
+ _In_ size_t cSteps, _In_ size_t cPixels, _In_count_(cPixels) const size_t* pIndex)
+{
+ float fError = FLT_MAX;
+ const float *pC = (3 == cSteps) ? pC3 : pC4;
+ const float *pD = (3 == cSteps) ? pD3 : pD4;
+
+ // Find Min and Max points, as starting point
+ HDRColorA X(1.0f, 1.0f, 1.0f, 0.0f);
+ HDRColorA Y(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(size_t iPoint = 0; iPoint < cPixels; iPoint++)
+ {
+ if(pPoints[pIndex[iPoint]].r < X.r) X.r = pPoints[pIndex[iPoint]].r;
+ if(pPoints[pIndex[iPoint]].g < X.g) X.g = pPoints[pIndex[iPoint]].g;
+ if(pPoints[pIndex[iPoint]].b < X.b) X.b = pPoints[pIndex[iPoint]].b;
+ if(pPoints[pIndex[iPoint]].r > Y.r) Y.r = pPoints[pIndex[iPoint]].r;
+ if(pPoints[pIndex[iPoint]].g > Y.g) Y.g = pPoints[pIndex[iPoint]].g;
+ if(pPoints[pIndex[iPoint]].b > Y.b) Y.b = pPoints[pIndex[iPoint]].b;
+ }
+
+ // Diagonal axis
+ HDRColorA AB;
+ AB.r = Y.r - X.r;
+ AB.g = Y.g - X.g;
+ AB.b = Y.b - X.b;
+
+ float fAB = AB.r * AB.r + AB.g * AB.g + AB.b * AB.b;
+
+ // Single color block.. no need to root-find
+ if(fAB < FLT_MIN)
+ {
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+ return 0.0f;
+ }
+
+ // Try all four axis directions, to determine which diagonal best fits data
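+    // The bounding box fixes the red axis; pairing the red minimum with either end of the green
+    // and blue ranges gives four candidate diagonals. The one with the largest projected
+    // variance over the pixels wins, and the endpoints are swapped per channel below to match.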
+ float fABInv = 1.0f / fAB;
+
+ HDRColorA Dir;
+ Dir.r = AB.r * fABInv;
+ Dir.g = AB.g * fABInv;
+ Dir.b = AB.b * fABInv;
+
+ HDRColorA Mid;
+ Mid.r = (X.r + Y.r) * 0.5f;
+ Mid.g = (X.g + Y.g) * 0.5f;
+ Mid.b = (X.b + Y.b) * 0.5f;
+
+ float fDir[4];
+ fDir[0] = fDir[1] = fDir[2] = fDir[3] = 0.0f;
+
+ for(size_t iPoint = 0; iPoint < cPixels; iPoint++)
+ {
+ HDRColorA Pt;
+ Pt.r = (pPoints[pIndex[iPoint]].r - Mid.r) * Dir.r;
+ Pt.g = (pPoints[pIndex[iPoint]].g - Mid.g) * Dir.g;
+ Pt.b = (pPoints[pIndex[iPoint]].b - Mid.b) * Dir.b;
+
+ float f;
+ f = Pt.r + Pt.g + Pt.b; fDir[0] += f * f;
+ f = Pt.r + Pt.g - Pt.b; fDir[1] += f * f;
+ f = Pt.r - Pt.g + Pt.b; fDir[2] += f * f;
+ f = Pt.r - Pt.g - Pt.b; fDir[3] += f * f;
+ }
+
+ float fDirMax = fDir[0];
+ size_t iDirMax = 0;
+
+ for(size_t iDir = 1; iDir < 4; iDir++)
+ {
+ if(fDir[iDir] > fDirMax)
+ {
+ fDirMax = fDir[iDir];
+ iDirMax = iDir;
+ }
+ }
+
+ if(iDirMax & 2) Swap( X.g, Y.g );
+ if(iDirMax & 1) Swap( X.b, Y.b );
+
+ // Two color block.. no need to root-find
+ if(fAB < 1.0f / 4096.0f)
+ {
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+ return 0.0f;
+ }
+
+ // Use Newton's Method to find local minima of sum-of-squares error.
+ float fSteps = (float) (cSteps - 1);
+
+ for(size_t iIteration = 0; iIteration < 8; iIteration++)
+ {
+ // Calculate new steps
+ HDRColorA pSteps[4];
+
+ for(size_t iStep = 0; iStep < cSteps; iStep++)
+ {
+ pSteps[iStep].r = X.r * pC[iStep] + Y.r * pD[iStep];
+ pSteps[iStep].g = X.g * pC[iStep] + Y.g * pD[iStep];
+ pSteps[iStep].b = X.b * pC[iStep] + Y.b * pD[iStep];
+ }
+
+ // Calculate color direction
+ Dir.r = Y.r - X.r;
+ Dir.g = Y.g - X.g;
+ Dir.b = Y.b - X.b;
+
+ float fLen = (Dir.r * Dir.r + Dir.g * Dir.g + Dir.b * Dir.b);
+
+ if(fLen < (1.0f / 4096.0f))
+ break;
+
+ float fScale = fSteps / fLen;
+
+ Dir.r *= fScale;
+ Dir.g *= fScale;
+ Dir.b *= fScale;
+
+ // Evaluate function, and derivatives
+ float d2X = 0.0f, d2Y = 0.0f;
+ HDRColorA dX(0.0f, 0.0f, 0.0f, 0.0f), dY(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(size_t iPoint = 0; iPoint < cPixels; iPoint++)
+ {
+ float fDot = (pPoints[pIndex[iPoint]].r - X.r) * Dir.r +
+ (pPoints[pIndex[iPoint]].g - X.g) * Dir.g +
+ (pPoints[pIndex[iPoint]].b - X.b) * Dir.b;
+
+ size_t iStep;
+ if(fDot <= 0.0f)
+ iStep = 0;
+            else if(fDot >= fSteps)
+ iStep = cSteps - 1;
+ else
+ iStep = size_t(fDot + 0.5f);
+
+ HDRColorA Diff;
+ Diff.r = pSteps[iStep].r - pPoints[pIndex[iPoint]].r;
+ Diff.g = pSteps[iStep].g - pPoints[pIndex[iPoint]].g;
+ Diff.b = pSteps[iStep].b - pPoints[pIndex[iPoint]].b;
+
+ float fC = pC[iStep] * (1.0f / 8.0f);
+ float fD = pD[iStep] * (1.0f / 8.0f);
+
+ d2X += fC * pC[iStep];
+ dX.r += fC * Diff.r;
+ dX.g += fC * Diff.g;
+ dX.b += fC * Diff.b;
+
+ d2Y += fD * pD[iStep];
+ dY.r += fD * Diff.r;
+ dY.g += fD * Diff.g;
+ dY.b += fD * Diff.b;
+ }
+
+ // Move endpoints
+ if(d2X > 0.0f)
+ {
+ float f = -1.0f / d2X;
+
+ X.r += dX.r * f;
+ X.g += dX.g * f;
+ X.b += dX.b * f;
+ }
+
+ if(d2Y > 0.0f)
+ {
+ float f = -1.0f / d2Y;
+
+ Y.r += dY.r * f;
+ Y.g += dY.g * f;
+ Y.b += dY.b * f;
+ }
+
+ if((dX.r * dX.r < fEpsilon) && (dX.g * dX.g < fEpsilon) && (dX.b * dX.b < fEpsilon) &&
+ (dY.r * dY.r < fEpsilon) && (dY.g * dY.g < fEpsilon) && (dY.b * dY.b < fEpsilon))
+ {
+ break;
+ }
+ }
+
+ pX->r = X.r; pX->g = X.g; pX->b = X.b;
+ pY->r = Y.r; pY->g = Y.g; pY->b = Y.b;
+ return fError;
+}
+
+
+//-------------------------------------------------------------------------------------
+static float OptimizeRGBA(_In_count_c_(NUM_PIXELS_PER_BLOCK) const HDRColorA* const pPoints,
+ _Out_ HDRColorA* pX, _Out_ HDRColorA* pY,
+ _In_ size_t cSteps, _In_ size_t cPixels, _In_count_(cPixels) const size_t* pIndex)
+{
+ float fError = FLT_MAX;
+ const float *pC = (3 == cSteps) ? pC3 : pC4;
+ const float *pD = (3 == cSteps) ? pD3 : pD4;
+
+ // Find Min and Max points, as starting point
+ HDRColorA X(1.0f, 1.0f, 1.0f, 1.0f);
+ HDRColorA Y(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(size_t iPoint = 0; iPoint < cPixels; iPoint++)
+ {
+ if(pPoints[pIndex[iPoint]].r < X.r) X.r = pPoints[pIndex[iPoint]].r;
+ if(pPoints[pIndex[iPoint]].g < X.g) X.g = pPoints[pIndex[iPoint]].g;
+ if(pPoints[pIndex[iPoint]].b < X.b) X.b = pPoints[pIndex[iPoint]].b;
+ if(pPoints[pIndex[iPoint]].a < X.a) X.a = pPoints[pIndex[iPoint]].a;
+ if(pPoints[pIndex[iPoint]].r > Y.r) Y.r = pPoints[pIndex[iPoint]].r;
+ if(pPoints[pIndex[iPoint]].g > Y.g) Y.g = pPoints[pIndex[iPoint]].g;
+ if(pPoints[pIndex[iPoint]].b > Y.b) Y.b = pPoints[pIndex[iPoint]].b;
+ if(pPoints[pIndex[iPoint]].a > Y.a) Y.a = pPoints[pIndex[iPoint]].a;
+ }
+
+ // Diagonal axis
+ HDRColorA AB = Y - X;
+ float fAB = AB * AB;
+
+ // Single color block.. no need to root-find
+ if(fAB < FLT_MIN)
+ {
+ *pX = X;
+ *pY = Y;
+ return 0.0f;
+ }
+
+ // Try all eight sign combinations (red kept positive) to determine which diagonal best fits the data
+ float fABInv = 1.0f / fAB;
+ HDRColorA Dir = AB * fABInv;
+ HDRColorA Mid = (X + Y) * 0.5f;
+
+ float fDir[8];
+ fDir[0] = fDir[1] = fDir[2] = fDir[3] = fDir[4] = fDir[5] = fDir[6] = fDir[7] = 0.0f;
+
+ for(size_t iPoint = 0; iPoint < cPixels; iPoint++)
+ {
+ HDRColorA Pt;
+ Pt.r = (pPoints[pIndex[iPoint]].r - Mid.r) * Dir.r;
+ Pt.g = (pPoints[pIndex[iPoint]].g - Mid.g) * Dir.g;
+ Pt.b = (pPoints[pIndex[iPoint]].b - Mid.b) * Dir.b;
+ Pt.a = (pPoints[pIndex[iPoint]].a - Mid.a) * Dir.a;
+
+ float f;
+ f = Pt.r + Pt.g + Pt.b + Pt.a; fDir[0] += f * f;
+ f = Pt.r + Pt.g + Pt.b - Pt.a; fDir[1] += f * f;
+ f = Pt.r + Pt.g - Pt.b + Pt.a; fDir[2] += f * f;
+ f = Pt.r + Pt.g - Pt.b - Pt.a; fDir[3] += f * f;
+ f = Pt.r - Pt.g + Pt.b + Pt.a; fDir[4] += f * f;
+ f = Pt.r - Pt.g + Pt.b - Pt.a; fDir[5] += f * f;
+ f = Pt.r - Pt.g - Pt.b + Pt.a; fDir[6] += f * f;
+ f = Pt.r - Pt.g - Pt.b - Pt.a; fDir[7] += f * f;
+ }
+
+ float fDirMax = fDir[0];
+ size_t iDirMax = 0;
+
+ for(size_t iDir = 1; iDir < 8; iDir++)
+ {
+ if(fDir[iDir] > fDirMax)
+ {
+ fDirMax = fDir[iDir];
+ iDirMax = iDir;
+ }
+ }
+
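+ // iDirMax encodes which of the g/b/a axes had its sign flipped in the best-fitting diagonal
+ // (bit 2 = green, bit 1 = blue, bit 0 = alpha; red is kept positive), so the matching endpoint
+ // components are swapped below to align the X -> Y segment with that diagonal.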
+ if(iDirMax & 4) Swap(X.g, Y.g);
+ if(iDirMax & 2) Swap(X.b, Y.b);
+ if(iDirMax & 1) Swap(X.a, Y.a);
+
+ // Two color block... no need to root-find
+ if(fAB < 1.0f / 4096.0f)
+ {
+ *pX = X;
+ *pY = Y;
+ return 0.0f;
+ }
+
+ // Use Newton's Method to find local minima of sum-of-squares error.
+ float fSteps = (float) (cSteps - 1);
+
+ for(size_t iIteration = 0; iIteration < 8 && fError > 0.0f; iIteration++)
+ {
+ // Calculate new steps
+ HDRColorA pSteps[BC7_MAX_INDICES];
+
+ LDRColorA aSteps[BC7_MAX_INDICES];
+ LDRColorA lX, lY;
+ lX = (X * 255.0f).ToLDRColorA();
+ lY = (Y * 255.0f).ToLDRColorA();
+
+ for(size_t iStep = 0; iStep < cSteps; iStep++)
+ {
+ pSteps[iStep] = X * pC[iStep] + Y * pD[iStep];
+ //LDRColorA::Interpolate(lX, lY, i, i, wcprec, waprec, aSteps[i]);
+ }
+
+ // Calculate color direction
+ Dir = Y - X;
+ float fLen = Dir * Dir;
+ if(fLen < (1.0f / 4096.0f))
+ break;
+
+ float fScale = fSteps / fLen;
+ Dir *= fScale;
+
+ // Evaluate function, and derivatives
+ float d2X = 0.0f, d2Y = 0.0f;
+ HDRColorA dX(0.0f, 0.0f, 0.0f, 0.0f), dY(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(size_t iPoint = 0; iPoint < cPixels; ++iPoint)
+ {
+ float fDot = (pPoints[pIndex[iPoint]] - X) * Dir;
+ size_t iStep;
+ if(fDot <= 0.0f)
+ iStep = 0;
+ else if(fDot >= fSteps)
+ iStep = cSteps - 1;
+ else
+ iStep = size_t(fDot + 0.5f);
+
+ HDRColorA Diff = pSteps[iStep] - pPoints[pIndex[iPoint]];
+ float fC = pC[iStep] * (1.0f / 8.0f);
+ float fD = pD[iStep] * (1.0f / 8.0f);
+
+ d2X += fC * pC[iStep];
+ dX += Diff * fC;
+
+ d2Y += fD * pD[iStep];
+ dY += Diff * fD;
+ }
+
+ // Move endpoints
+ if(d2X > 0.0f)
+ {
+ float f = -1.0f / d2X;
+ X += dX * f;
+ }
+
+ if(d2Y > 0.0f)
+ {
+ float f = -1.0f / d2Y;
+ Y += dY * f;
+ }
+
+ if((dX * dX < fEpsilon) && (dY * dY < fEpsilon))
+ break;
+ }
+
+ *pX = X;
+ *pY = Y;
+ return fError;
+}
+
+
+//-------------------------------------------------------------------------------------
+#pragma warning(disable: 4616 6001 6297)
+
+static float ComputeError(_In_ const LDRColorA& pixel, _In_count_x_(1 << uIndexPrec) const LDRColorA aPalette[],
+ _In_ uint8_t uIndexPrec, _In_ uint8_t uIndexPrec2, _Out_opt_ size_t* pBestIndex = nullptr, _Out_opt_ size_t* pBestIndex2 = nullptr)
+{
+ const size_t uNumIndices = 1 << uIndexPrec;
+ const size_t uNumIndices2 = 1 << uIndexPrec2;
+ float fTotalErr = 0;
+ float fBestErr = FLT_MAX;
+
+ if(pBestIndex)
+ *pBestIndex = 0;
+ if(pBestIndex2)
+ *pBestIndex2 = 0;
+
+ if(uIndexPrec2 == 0)
+ {
+ for(register size_t i = 0; i < uNumIndices && fBestErr > 0; i++)
+ {
+ float fErr = ErrorMetric(pixel, aPalette[i]);
+ if(fErr > fBestErr) // error increased, so we're done searching
+ break;
+ if(fErr < fBestErr)
+ {
+ fBestErr = fErr;
+ if(pBestIndex)
+ *pBestIndex = i;
+ }
+ }
+ fTotalErr += fBestErr;
+ }
+ else
+ {
+ for(register size_t i = 0; i < uNumIndices && fBestErr > 0; i++)
+ {
+ float fErr = ErrorMetricRGB(pixel, aPalette[i]);
+ if(fErr > fBestErr) // error increased, so we're done searching
+ break;
+ if(fErr < fBestErr)
+ {
+ fBestErr = fErr;
+ if(pBestIndex)
+ *pBestIndex = i;
+ }
+ }
+ fTotalErr += fBestErr;
+ fBestErr = FLT_MAX;
+ for(register size_t i = 0; i < uNumIndices2 && fBestErr > 0; i++)
+ {
+ float fErr = ErrorMetricAlpha(pixel, aPalette[i]);
+ if(fErr > fBestErr) // error increased, so we're done searching
+ break;
+ if(fErr < fBestErr)
+ {
+ fBestErr = fErr;
+ if(pBestIndex2)
+ *pBestIndex2 = i;
+ }
+ }
+ fTotalErr += fBestErr;
+ }
+
+ return fTotalErr;
+}
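+
+// Note on the early-out above: the palette entries lie on a single line between the two
+// endpoints, so the squared error as a function of the index is unimodal; once the error
+// starts increasing there is no better entry further along, and the scan can stop.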
+
+
+inline static void FillWithErrorColors( _Out_cap_c_(NUM_PIXELS_PER_BLOCK) HDRColorA* pOut )
+{
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+#ifdef _DEBUG
+ // Use Magenta in debug as a highly-visible error color
+ pOut[i] = HDRColorA(1.0f, 0.0f, 1.0f, 1.0f);
+#else
+ // In production use, default to black
+ pOut[i] = HDRColorA(0.0f, 0.0f, 0.0f, 1.0f);
+#endif
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// BC6H Compression
+//-------------------------------------------------------------------------------------
+void D3DX_BC6H::Decode(bool bSigned, HDRColorA* pOut) const
+{
+ assert(pOut );
+
+ size_t uStartBit = 0;
+ uint8_t uMode = GetBits(uStartBit, 2);
+ if(uMode != 0x00 && uMode != 0x01)
+ {
+ uMode = (GetBits(uStartBit, 3) << 2) | uMode;
+ }
+
+ assert( uMode < 32 );
+ __analysis_assume( uMode < 32 );
+
+ if ( ms_aModeToInfo[uMode] >= 0 )
+ {
+ assert(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aDesc));
+ __analysis_assume(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aDesc));
+ const ModeDescriptor* desc = ms_aDesc[ms_aModeToInfo[uMode]];
+
+ assert(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aInfo));
+ __analysis_assume(ms_aModeToInfo[uMode] < ARRAYSIZE(ms_aInfo));
+ const ModeInfo& info = ms_aInfo[ms_aModeToInfo[uMode]];
+
+ INTEndPntPair aEndPts[BC6H_MAX_REGIONS];
+ memset(aEndPts, 0, BC6H_MAX_REGIONS * 2 * sizeof(INTColor));
+ uint32_t uShape = 0;
+
+ // Read header
+ const size_t uHeaderBits = info.uPartitions > 0 ? 82 : 65;
+ while(uStartBit < uHeaderBits)
+ {
+ size_t uCurBit = uStartBit;
+ if(GetBit(uStartBit))
+ {
+ switch(desc[uCurBit].m_eField)
+ {
+ case D: uShape |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case RW: aEndPts[0].A.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case RX: aEndPts[0].B.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case RY: aEndPts[1].A.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case RZ: aEndPts[1].B.r |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case GW: aEndPts[0].A.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case GX: aEndPts[0].B.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case GY: aEndPts[1].A.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case GZ: aEndPts[1].B.g |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case BW: aEndPts[0].A.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case BX: aEndPts[0].B.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case BY: aEndPts[1].A.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ case BZ: aEndPts[1].B.b |= 1 << uint32_t(desc[uCurBit].m_uBit); break;
+ default:
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC6H: Invalid header bits encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+ }
+ }
+ }
+
+ assert( uShape < 64 );
+ __analysis_assume( uShape < 64 );
+
+ // Sign extend necessary end points
+ if(bSigned)
+ {
+ aEndPts[0].A.SignExtend(info.RGBAPrec[0][0]);
+ }
+ if(bSigned || info.bTransformed)
+ {
+ assert( info.uPartitions < BC6H_MAX_REGIONS );
+ __analysis_assume( info.uPartitions < BC6H_MAX_REGIONS );
+ for(size_t p = 0; p <= info.uPartitions; ++p)
+ {
+ if(p != 0)
+ {
+ aEndPts[p].A.SignExtend(info.RGBAPrec[p][0]);
+ }
+ aEndPts[p].B.SignExtend(info.RGBAPrec[p][1]);
+ }
+ }
+
+ // Inverse transform the end points
+ if(info.bTransformed)
+ {
+ TransformInverse(aEndPts, info.RGBAPrec[0][0], bSigned);
+ }
+
+ // Read indices
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ size_t uNumBits = IsFixUpOffset(info.uPartitions, uShape, i) ? info.uIndexPrec-1 : info.uIndexPrec;
+ if ( uStartBit + uNumBits > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC6H: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+ uint8_t uIndex = GetBits(uStartBit, uNumBits);
+
+ if ( uIndex >= ((info.uPartitions > 0) ? 8 : 16) )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC6H: Invalid index encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ size_t uRegion = g_aPartitionTable[info.uPartitions][uShape][i];
+ assert( uRegion < BC6H_MAX_REGIONS );
+ __analysis_assume( uRegion < BC6H_MAX_REGIONS );
+
+ // Unquantize endpoints and interpolate
+ int r1 = Unquantize(aEndPts[uRegion].A.r, info.RGBAPrec[0][0].r, bSigned);
+ int g1 = Unquantize(aEndPts[uRegion].A.g, info.RGBAPrec[0][0].g, bSigned);
+ int b1 = Unquantize(aEndPts[uRegion].A.b, info.RGBAPrec[0][0].b, bSigned);
+ int r2 = Unquantize(aEndPts[uRegion].B.r, info.RGBAPrec[0][0].r, bSigned);
+ int g2 = Unquantize(aEndPts[uRegion].B.g, info.RGBAPrec[0][0].g, bSigned);
+ int b2 = Unquantize(aEndPts[uRegion].B.b, info.RGBAPrec[0][0].b, bSigned);
+ const int* aWeights = info.uPartitions > 0 ? g_aWeights3 : g_aWeights4;
+ INTColor fc;
+ fc.r = FinishUnquantize((r1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + r2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned);
+ fc.g = FinishUnquantize((g1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + g2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned);
+ fc.b = FinishUnquantize((b1 * (BC67_WEIGHT_MAX - aWeights[uIndex]) + b2 * aWeights[uIndex] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT, bSigned);
+
+ HALF rgb[3];
+ fc.ToF16(rgb, bSigned);
+
+ pOut[i].r = XMConvertHalfToFloat( rgb[0] );
+ pOut[i].g = XMConvertHalfToFloat( rgb[1] );
+ pOut[i].b = XMConvertHalfToFloat( rgb[2] );
+ pOut[i].a = 1.0f;
+ }
+ }
+ else
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC6H: Invalid mode encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ }
+}
+
+void D3DX_BC6H::Encode(bool bSigned, const HDRColorA* const pIn)
+{
+ assert( pIn );
+
+ EncodeParams EP(pIn, bSigned);
+
+ for(EP.uMode = 0; EP.uMode < ARRAYSIZE(ms_aInfo) && EP.fBestErr > 0; ++EP.uMode)
+ {
+ const uint8_t uShapes = ms_aInfo[EP.uMode].uPartitions ? 32 : 1;
+ // Number of rough cases to examine; reasonable values are 1, uShapes/4, and uShapes.
+ // uShapes/4 catches nearly all the cases; increase it a bit (say by 3 or 4) to squeeze out the last bit of quality.
+ const size_t uItems = std::max<size_t>(1, uShapes >> 2);
+ float afRoughMSE[BC6H_MAX_SHAPES];
+ uint8_t auShape[BC6H_MAX_SHAPES];
+
+ // pick the best uItems shapes and refine these.
+ for(EP.uShape = 0; EP.uShape < uShapes; ++EP.uShape)
+ {
+ size_t uShape = EP.uShape;
+ afRoughMSE[uShape] = RoughMSE(&EP);
+ auShape[uShape] = static_cast<uint8_t>(uShape);
+ }
+
+ // Bubble up the first uItems items
+ for(register size_t i = 0; i < uItems; i++)
+ {
+ for(register size_t j = i + 1; j < uShapes; j++)
+ {
+ if(afRoughMSE[i] > afRoughMSE[j])
+ {
+ Swap(afRoughMSE[i], afRoughMSE[j]);
+ Swap(auShape[i], auShape[j]);
+ }
+ }
+ }
+
+ for(size_t i = 0; i < uItems && EP.fBestErr > 0; i++)
+ {
+ EP.uShape = auShape[i];
+ Refine(&EP);
+ }
+ }
+}
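+
+// Illustrative numbers for the shape pruning above: two-region BC6H modes expose 32 partition
+// shapes, so uItems = max(1, 32 >> 2) = 8 shapes receive the full Refine() pass after the cheap
+// RoughMSE() ranking; one-region modes have a single shape, so every candidate is refined.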
+
+
+//-------------------------------------------------------------------------------------
+int D3DX_BC6H::Quantize(int iValue, int prec, bool bSigned)
+{
+ assert(prec > 1); // didn't bother to make it work for 1
+ int q, s = 0;
+ if(bSigned)
+ {
+ assert(iValue >= -F16MAX && iValue <= F16MAX);
+ if(iValue < 0)
+ {
+ s = 1;
+ iValue = -iValue;
+ }
+ q = (prec >= 16) ? iValue : (iValue << (prec-1)) / (F16MAX+1);
+ if(s)
+ q = -q;
+ assert (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+ }
+ else
+ {
+ assert(iValue >= 0 && iValue <= F16MAX);
+ q = (prec >= 15) ? iValue : (iValue << prec) / (F16MAX+1);
+ assert (q >= 0 && q < (1 << prec));
+ }
+
+ return q;
+}
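+
+// Worked example for Quantize (a sketch; it assumes F16MAX is 0x7BFF = 31743, the largest
+// finite half-float magnitude handled by the encoder):
+//   unsigned, prec = 10, iValue = 31743:  q = (31743 << 10) / 31744 = 1023  (the 10-bit maximum)
+//   unsigned, prec = 10, iValue = 15872:  q = (15872 << 10) / 31744 = 512   (about half scale)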
+
+int D3DX_BC6H::Unquantize(int comp, uint8_t uBitsPerComp, bool bSigned)
+{
+ int unq = 0, s = 0;
+ if(bSigned)
+ {
+ if(uBitsPerComp >= 16)
+ {
+ unq = comp;
+ }
+ else
+ {
+ if(comp < 0)
+ {
+ s = 1;
+ comp = -comp;
+ }
+
+ if(comp == 0) unq = 0;
+ else if(comp >= ((1 << (uBitsPerComp - 1)) - 1)) unq = 0x7FFF;
+ else unq = ((comp << 15) + 0x4000) >> (uBitsPerComp-1);
+
+ if(s) unq = -unq;
+ }
+ }
+ else
+ {
+ if(uBitsPerComp >= 15) unq = comp;
+ else if(comp == 0) unq = 0;
+ else if(comp == ((1 << uBitsPerComp) - 1)) unq = 0xFFFF;
+ else unq = ((comp << 16) + 0x8000) >> uBitsPerComp;
+ }
+
+ return unq;
+}
+
+int D3DX_BC6H::FinishUnquantize(int comp, bool bSigned)
+{
+ if(bSigned)
+ {
+ return (comp < 0) ? -(((-comp) * 31) >> 5) : (comp * 31) >> 5; // scale the magnitude by 31/32
+ }
+ else
+ {
+ return (comp * 31) >> 6; // scale the magnitude by 31/64
+ }
+}
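+
+// Worked example of the two-stage unquantize above (a sketch): for an unsigned component with
+// uBitsPerComp = 10 and comp = 512, Unquantize yields ((512 << 16) + 0x8000) >> 10 = 32800, and
+// FinishUnquantize scales it by 31/64: (32800 * 31) >> 6 = 15887, an integer in the half-float
+// range that the decoder converts back to a float via INTColor::ToF16().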
+
+
+//-------------------------------------------------------------------------------------
+bool D3DX_BC6H::EndPointsFit(const EncodeParams* pEP, const INTEndPntPair aEndPts[])
+{
+ assert( pEP );
+ const bool bTransformed = ms_aInfo[pEP->uMode].bTransformed;
+ const bool bIsSigned = pEP->bSigned;
+ const LDRColorA& Prec0 = ms_aInfo[pEP->uMode].RGBAPrec[0][0];
+ const LDRColorA& Prec1 = ms_aInfo[pEP->uMode].RGBAPrec[0][1];
+ const LDRColorA& Prec2 = ms_aInfo[pEP->uMode].RGBAPrec[1][0];
+ const LDRColorA& Prec3 = ms_aInfo[pEP->uMode].RGBAPrec[1][1];
+
+ INTColor aBits[4];
+ aBits[0].r = NBits(aEndPts[0].A.r, bIsSigned);
+ aBits[0].g = NBits(aEndPts[0].A.g, bIsSigned);
+ aBits[0].b = NBits(aEndPts[0].A.b, bIsSigned);
+ aBits[1].r = NBits(aEndPts[0].B.r, bTransformed || bIsSigned);
+ aBits[1].g = NBits(aEndPts[0].B.g, bTransformed || bIsSigned);
+ aBits[1].b = NBits(aEndPts[0].B.b, bTransformed || bIsSigned);
+ if(aBits[0].r > Prec0.r || aBits[1].r > Prec1.r ||
+ aBits[0].g > Prec0.g || aBits[1].g > Prec1.g ||
+ aBits[0].b > Prec0.b || aBits[1].b > Prec1.b)
+ return false;
+
+ if(ms_aInfo[pEP->uMode].uPartitions)
+ {
+ aBits[2].r = NBits(aEndPts[1].A.r, bTransformed || bIsSigned);
+ aBits[2].g = NBits(aEndPts[1].A.g, bTransformed || bIsSigned);
+ aBits[2].b = NBits(aEndPts[1].A.b, bTransformed || bIsSigned);
+ aBits[3].r = NBits(aEndPts[1].B.r, bTransformed || bIsSigned);
+ aBits[3].g = NBits(aEndPts[1].B.g, bTransformed || bIsSigned);
+ aBits[3].b = NBits(aEndPts[1].B.b, bTransformed || bIsSigned);
+
+ if(aBits[2].r > Prec2.r || aBits[3].r > Prec3.r ||
+ aBits[2].g > Prec2.g || aBits[3].g > Prec3.g ||
+ aBits[2].b > Prec2.b || aBits[3].b > Prec3.b)
+ return false;
+ }
+
+ return true;
+}
+
+void D3DX_BC6H::GeneratePaletteQuantized(const EncodeParams* pEP, const INTEndPntPair& endPts, INTColor aPalette[]) const
+{
+ assert( pEP );
+ const size_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const size_t uNumIndices = 1 << uIndexPrec;
+ assert( uNumIndices > 0 );
+ __analysis_assume( uNumIndices > 0 );
+ const LDRColorA& Prec = ms_aInfo[pEP->uMode].RGBAPrec[0][0];
+
+ // scale endpoints
+ INTEndPntPair unqEndPts;
+ unqEndPts.A.r = Unquantize(endPts.A.r, Prec.r, pEP->bSigned);
+ unqEndPts.A.g = Unquantize(endPts.A.g, Prec.g, pEP->bSigned);
+ unqEndPts.A.b = Unquantize(endPts.A.b, Prec.b, pEP->bSigned);
+ unqEndPts.B.r = Unquantize(endPts.B.r, Prec.r, pEP->bSigned);
+ unqEndPts.B.g = Unquantize(endPts.B.g, Prec.g, pEP->bSigned);
+ unqEndPts.B.b = Unquantize(endPts.B.b, Prec.b, pEP->bSigned);
+
+ // interpolate
+ const int* aWeights = nullptr;
+ switch(uIndexPrec)
+ {
+ case 3: aWeights = g_aWeights3; assert(uNumIndices <= 8); __analysis_assume(uNumIndices <= 8); break;
+ case 4: aWeights = g_aWeights4; assert(uNumIndices <= 16); __analysis_assume(uNumIndices <= 16); break;
+ default: assert(false); for(size_t i=0; i < uNumIndices; ++i) aPalette[i] = INTColor(0,0,0); return;
+ }
+
+ for (size_t i = 0; i < uNumIndices; ++i)
+ {
+ aPalette[i].r = FinishUnquantize(
+ (unqEndPts.A.r * (BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.r * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT,
+ pEP->bSigned);
+ aPalette[i].g = FinishUnquantize(
+ (unqEndPts.A.g * (BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.g * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT,
+ pEP->bSigned);
+ aPalette[i].b = FinishUnquantize(
+ (unqEndPts.A.b * (BC67_WEIGHT_MAX - aWeights[i]) + unqEndPts.B.b * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT,
+ pEP->bSigned);
+ }
+}
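+
+// The loop above is the standard BC6H/BC7 fixed-point blend (a sketch; it assumes
+// BC67_WEIGHT_MAX = 64, BC67_WEIGHT_ROUND = 32 and BC67_WEIGHT_SHIFT = 6, i.e. 6-bit weights):
+//   palette[i] = (A * (64 - w[i]) + B * w[i] + 32) >> 6
+// which is A + (w[i] / 64) * (B - A) rounded to the nearest integer.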
+
+// Given a collection of colors and quantized endpoints, generate a palette, choose the best entries, and return the total error
+float D3DX_BC6H::MapColorsQuantized(const EncodeParams* pEP, const INTColor aColors[], size_t np, const INTEndPntPair &endPts) const
+{
+ assert( pEP );
+
+ const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uNumIndices = 1 << uIndexPrec;
+ INTColor aPalette[BC6H_MAX_INDICES];
+ GeneratePaletteQuantized(pEP, endPts, aPalette);
+
+ float fTotErr = 0;
+ for(size_t i = 0; i < np; ++i)
+ {
+ float fBestErr = Norm(aColors[i], aPalette[0]);
+ for(int j = 1; j < uNumIndices && fBestErr > 0; ++j)
+ {
+ float fErr = Norm(aColors[i], aPalette[j]);
+ if(fErr > fBestErr) break; // error increased, so we're done searching
+ if(fErr < fBestErr) fBestErr = fErr;
+ }
+ fTotErr += fBestErr;
+ }
+ return fTotErr;
+}
+
+float D3DX_BC6H::PerturbOne(const EncodeParams* pEP, const INTColor aColors[], size_t np, uint8_t ch,
+ const INTEndPntPair& oldEndPts, INTEndPntPair& newEndPts, float fOldErr, int do_b) const
+{
+ assert( pEP );
+ uint8_t uPrec;
+ switch(ch)
+ {
+ case 0: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].r; break;
+ case 1: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].g; break;
+ case 2: uPrec = ms_aInfo[pEP->uMode].RGBAPrec[0][0].b; break;
+ default: assert(false); newEndPts = oldEndPts; return FLT_MAX;
+ }
+ INTEndPntPair tmpEndPts;
+ float fMinErr = fOldErr;
+ int beststep = 0;
+
+ // copy real endpoints so we can perturb them
+ tmpEndPts = newEndPts = oldEndPts;
+
+ // do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+ for(int step = 1 << (uPrec-1); step; step >>= 1)
+ {
+ bool bImproved = false;
+ for(int sign = -1; sign <= 1; sign += 2)
+ {
+ if(do_b == 0)
+ {
+ tmpEndPts.A[ch] = newEndPts.A[ch] + sign * step;
+ if(tmpEndPts.A[ch] < 0 || tmpEndPts.A[ch] >= (1 << uPrec))
+ continue;
+ }
+ else
+ {
+ tmpEndPts.B[ch] = newEndPts.B[ch] + sign * step;
+ if(tmpEndPts.B[ch] < 0 || tmpEndPts.B[ch] >= (1 << uPrec))
+ continue;
+ }
+
+ float fErr = MapColorsQuantized(pEP, aColors, np, tmpEndPts);
+
+ if(fErr < fMinErr)
+ {
+ bImproved = true;
+ fMinErr = fErr;
+ beststep = sign * step;
+ }
+ }
+ // if this was an improvement, move the endpoint and continue search from there
+ if(bImproved)
+ {
+ if(do_b == 0)
+ newEndPts.A[ch] += beststep;
+ else
+ newEndPts.B[ch] += beststep;
+ }
+ }
+ return fMinErr;
+}
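+
+// Example of the search schedule above: for a component stored with uPrec = 10 the step sizes
+// tried are 512, 256, 128, ..., 2, 1, with both signs at each size, so at most about 2 * 10
+// palette evaluations are needed instead of scanning all 1024 possible endpoint values.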
+
+void D3DX_BC6H::OptimizeOne(const EncodeParams* pEP, const INTColor aColors[], size_t np, float aOrgErr,
+ const INTEndPntPair &aOrgEndPts, INTEndPntPair &aOptEndPts) const
+{
+ assert( pEP );
+ float aOptErr = aOrgErr;
+ aOptEndPts.A = aOrgEndPts.A;
+ aOptEndPts.B = aOrgEndPts.B;
+
+ INTEndPntPair new_a, new_b;
+ INTEndPntPair newEndPts;
+ int do_b;
+
+ // now optimize each channel separately
+ for(uint8_t ch = 0; ch < 3; ++ch)
+ {
+ // figure out which endpoint when perturbed gives the most improvement and start there
+ // if we just alternate, we can easily end up in a local minimum
+ float fErr0 = PerturbOne(pEP, aColors, np, ch, aOptEndPts, new_a, aOptErr, 0); // perturb endpt A
+ float fErr1 = PerturbOne(pEP, aColors, np, ch, aOptEndPts, new_b, aOptErr, 1); // perturb endpt B
+
+ if(fErr0 < fErr1)
+ {
+ if(fErr0 >= aOptErr) continue;
+ aOptEndPts.A[ch] = new_a.A[ch];
+ aOptErr = fErr0;
+ do_b = 1; // do B next
+ }
+ else
+ {
+ if(fErr1 >= aOptErr) continue;
+ aOptEndPts.B[ch] = new_b.B[ch];
+ aOptErr = fErr1;
+ do_b = 0; // do A next
+ }
+
+ // now alternate endpoints and keep trying until there is no improvement
+ for(;;)
+ {
+ float fErr = PerturbOne(pEP, aColors, np, ch, aOptEndPts, newEndPts, aOptErr, do_b);
+ if(fErr >= aOptErr)
+ break;
+ if(do_b == 0)
+ aOptEndPts.A[ch] = newEndPts.A[ch];
+ else
+ aOptEndPts.B[ch] = newEndPts.B[ch];
+ aOptErr = fErr;
+ do_b = 1 - do_b; // now move the other endpoint
+ }
+ }
+}
+
+void D3DX_BC6H::OptimizeEndPoints(const EncodeParams* pEP, const float aOrgErr[], const INTEndPntPair aOrgEndPts[], INTEndPntPair aOptEndPts[]) const
+{
+ assert( pEP );
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC6H_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS );
+ INTColor aPixels[NUM_PIXELS_PER_BLOCK];
+
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ // collect the pixels in the region
+ size_t np = 0;
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ if(g_aPartitionTable[p][pEP->uShape][i] == p)
+ {
+ aPixels[np++] = pEP->aIPixels[i];
+ }
+ }
+
+ OptimizeOne(pEP, aPixels, np, aOrgErr[p], aOrgEndPts[p], aOptEndPts[p]);
+ }
+}
+
+// Swap endpoints as needed to ensure that the indices at the fix-up positions have a 0 high-order bit
+void D3DX_BC6H::SwapIndices(const EncodeParams* pEP, INTEndPntPair aEndPts[], size_t aIndices[])
+{
+ assert( pEP );
+ const size_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ const size_t uNumIndices = 1 << ms_aInfo[pEP->uMode].uIndexPrec;
+ const size_t uHighIndexBit = uNumIndices >> 1;
+
+ assert( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ size_t i = g_aFixUp[uPartitions][pEP->uShape][p];
+ assert(g_aPartitionTable[uPartitions][pEP->uShape][i] == p);
+ if(aIndices[i] & uHighIndexBit)
+ {
+ // high bit is set, swap the aEndPts and indices for this region
+ Swap(aEndPts[p].A, aEndPts[p].B);
+
+ for(size_t j = 0; j < NUM_PIXELS_PER_BLOCK; ++j)
+ if(g_aPartitionTable[uPartitions][pEP->uShape][j] == p)
+ aIndices[j] = uNumIndices - 1 - aIndices[j];
+ }
+ }
+}
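+
+// Worked example for the swap above: with 3-bit indices (uNumIndices = 8, uHighIndexBit = 4),
+// a fix-up pixel that received index 5 causes its region's endpoints to be swapped and every
+// index in that region to be replaced by 8 - 1 - i, so the fix-up index becomes 2 and its high
+// bit is cleared as the format requires.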
+
+// Assign indices given a tile, shape, and quantized endpoints; return the total error for each region
+void D3DX_BC6H::AssignIndices(const EncodeParams* pEP, const INTEndPntPair aEndPts[], size_t aIndices[], float aTotErr[]) const
+{
+ assert( pEP );
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ const uint8_t uNumIndices = 1 << ms_aInfo[pEP->uMode].uIndexPrec;
+
+ assert( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+
+ // build list of possibles
+ INTColor aPalette[BC6H_MAX_REGIONS][BC6H_MAX_INDICES];
+
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ GeneratePaletteQuantized(pEP, aEndPts[p], aPalette[p]);
+ aTotErr[p] = 0;
+ }
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ const uint8_t uRegion = g_aPartitionTable[uPartitions][pEP->uShape][i];
+ assert( uRegion < BC6H_MAX_REGIONS );
+ __analysis_assume( uRegion < BC6H_MAX_REGIONS );
+ float fBestErr = Norm(pEP->aIPixels[i], aPalette[uRegion][0]);
+ aIndices[i] = 0;
+
+ for(uint8_t j = 1; j < uNumIndices && fBestErr > 0; ++j)
+ {
+ float fErr = Norm(pEP->aIPixels[i], aPalette[uRegion][j]);
+ if(fErr > fBestErr) break; // error increased, so we're done searching
+ if(fErr < fBestErr)
+ {
+ fBestErr = fErr;
+ aIndices[i] = j;
+ }
+ }
+ aTotErr[uRegion] += fBestErr;
+ }
+}
+
+void D3DX_BC6H::QuantizeEndPts(const EncodeParams* pEP, INTEndPntPair* aQntEndPts) const
+{
+ assert( pEP && aQntEndPts );
+ const INTEndPntPair* aUnqEndPts = pEP->aUnqEndPts[pEP->uShape];
+ const LDRColorA& Prec = ms_aInfo[pEP->uMode].RGBAPrec[0][0];
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC6H_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS );
+
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ aQntEndPts[p].A.r = Quantize(aUnqEndPts[p].A.r, Prec.r, pEP->bSigned);
+ aQntEndPts[p].A.g = Quantize(aUnqEndPts[p].A.g, Prec.g, pEP->bSigned);
+ aQntEndPts[p].A.b = Quantize(aUnqEndPts[p].A.b, Prec.b, pEP->bSigned);
+ aQntEndPts[p].B.r = Quantize(aUnqEndPts[p].B.r, Prec.r, pEP->bSigned);
+ aQntEndPts[p].B.g = Quantize(aUnqEndPts[p].B.g, Prec.g, pEP->bSigned);
+ aQntEndPts[p].B.b = Quantize(aUnqEndPts[p].B.b, Prec.b, pEP->bSigned);
+ }
+}
+
+void D3DX_BC6H::EmitBlock(const EncodeParams* pEP, const INTEndPntPair aEndPts[], const size_t aIndices[])
+{
+ assert( pEP );
+ const uint8_t uRealMode = ms_aInfo[pEP->uMode].uMode;
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const size_t uHeaderBits = uPartitions > 0 ? 82 : 65;
+ const ModeDescriptor* desc = ms_aDesc[pEP->uMode];
+ size_t uStartBit = 0;
+
+ while(uStartBit < uHeaderBits)
+ {
+ switch(desc[uStartBit].m_eField)
+ {
+ case M: SetBit(uStartBit, uint8_t(uRealMode >> desc[uStartBit].m_uBit) & 0x01); break;
+ case D: SetBit(uStartBit, uint8_t(pEP->uShape >> desc[uStartBit].m_uBit) & 0x01); break;
+ case RW: SetBit(uStartBit, uint8_t(aEndPts[0].A.r >> desc[uStartBit].m_uBit) & 0x01); break;
+ case RX: SetBit(uStartBit, uint8_t(aEndPts[0].B.r >> desc[uStartBit].m_uBit) & 0x01); break;
+ case RY: SetBit(uStartBit, uint8_t(aEndPts[1].A.r >> desc[uStartBit].m_uBit) & 0x01); break;
+ case RZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.r >> desc[uStartBit].m_uBit) & 0x01); break;
+ case GW: SetBit(uStartBit, uint8_t(aEndPts[0].A.g >> desc[uStartBit].m_uBit) & 0x01); break;
+ case GX: SetBit(uStartBit, uint8_t(aEndPts[0].B.g >> desc[uStartBit].m_uBit) & 0x01); break;
+ case GY: SetBit(uStartBit, uint8_t(aEndPts[1].A.g >> desc[uStartBit].m_uBit) & 0x01); break;
+ case GZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.g >> desc[uStartBit].m_uBit) & 0x01); break;
+ case BW: SetBit(uStartBit, uint8_t(aEndPts[0].A.b >> desc[uStartBit].m_uBit) & 0x01); break;
+ case BX: SetBit(uStartBit, uint8_t(aEndPts[0].B.b >> desc[uStartBit].m_uBit) & 0x01); break;
+ case BY: SetBit(uStartBit, uint8_t(aEndPts[1].A.b >> desc[uStartBit].m_uBit) & 0x01); break;
+ case BZ: SetBit(uStartBit, uint8_t(aEndPts[1].B.b >> desc[uStartBit].m_uBit) & 0x01); break;
+ default: assert(false);
+ }
+ }
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ if(IsFixUpOffset(ms_aInfo[pEP->uMode].uPartitions, pEP->uShape, i))
+ SetBits(uStartBit, uIndexPrec - 1, static_cast<uint8_t>( aIndices[i] ));
+ else
+ SetBits(uStartBit, uIndexPrec, static_cast<uint8_t>( aIndices[i] ));
+ }
+ assert(uStartBit == 128);
+}
+
+void D3DX_BC6H::Refine(EncodeParams* pEP)
+{
+ assert( pEP );
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC6H_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS );
+
+ const bool bTransformed = ms_aInfo[pEP->uMode].bTransformed;
+ float aOrgErr[BC6H_MAX_REGIONS], aOptErr[BC6H_MAX_REGIONS];
+ INTEndPntPair aOrgEndPts[BC6H_MAX_REGIONS], aOptEndPts[BC6H_MAX_REGIONS];
+ size_t aOrgIdx[NUM_PIXELS_PER_BLOCK], aOptIdx[NUM_PIXELS_PER_BLOCK];
+
+ QuantizeEndPts(pEP, aOrgEndPts);
+ AssignIndices(pEP, aOrgEndPts, aOrgIdx, aOrgErr);
+ SwapIndices(pEP, aOrgEndPts, aOrgIdx);
+
+ if(bTransformed) TransformForward(aOrgEndPts);
+ if(EndPointsFit(pEP, aOrgEndPts))
+ {
+ if(bTransformed) TransformInverse(aOrgEndPts, ms_aInfo[pEP->uMode].RGBAPrec[0][0], pEP->bSigned);
+ OptimizeEndPoints(pEP, aOrgErr, aOrgEndPts, aOptEndPts);
+ AssignIndices(pEP, aOptEndPts, aOptIdx, aOptErr);
+ SwapIndices(pEP, aOptEndPts, aOptIdx);
+
+ float fOrgTotErr = 0.0f, fOptTotErr = 0.0f;
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ fOrgTotErr += aOrgErr[p];
+ fOptTotErr += aOptErr[p];
+ }
+
+ if(bTransformed) TransformForward(aOptEndPts);
+ if(EndPointsFit(pEP, aOptEndPts) && fOptTotErr < fOrgTotErr && fOptTotErr < pEP->fBestErr)
+ {
+ pEP->fBestErr = fOptTotErr;
+ EmitBlock(pEP, aOptEndPts, aOptIdx);
+ }
+ else if(fOrgTotErr < pEP->fBestErr)
+ {
+ // either it stopped fitting when we optimized it, or there was no improvement
+ // so go back to the unoptimized endpoints which we know will fit
+ if(bTransformed) TransformForward(aOrgEndPts);
+ pEP->fBestErr = fOrgTotErr;
+ EmitBlock(pEP, aOrgEndPts, aOrgIdx);
+ }
+ }
+}
+
+void D3DX_BC6H::GeneratePaletteUnquantized(const EncodeParams* pEP, size_t uRegion, INTColor aPalette[])
+{
+ assert( pEP );
+ assert( uRegion < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+ __analysis_assume( uRegion < BC6H_MAX_REGIONS && pEP->uShape < BC6H_MAX_SHAPES );
+ const INTEndPntPair& endPts = pEP->aUnqEndPts[pEP->uShape][uRegion];
+ const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uNumIndices = 1 << uIndexPrec;
+ assert( uNumIndices > 0 );
+ __analysis_assume( uNumIndices > 0 );
+
+ const int* aWeights = nullptr;
+ switch(uIndexPrec)
+ {
+ case 3: aWeights = g_aWeights3; assert(uNumIndices <= 8); __analysis_assume(uNumIndices <= 8); break;
+ case 4: aWeights = g_aWeights4; assert(uNumIndices <= 16); __analysis_assume(uNumIndices <= 16); break;
+ default: assert(false); for(size_t i = 0; i < uNumIndices; ++i) aPalette[i] = INTColor(0,0,0); return;
+ }
+
+ for(register size_t i = 0; i < uNumIndices; ++i)
+ {
+ aPalette[i].r = (endPts.A.r * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.r * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT;
+ aPalette[i].g = (endPts.A.g * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.g * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT;
+ aPalette[i].b = (endPts.A.b * (BC67_WEIGHT_MAX - aWeights[i]) + endPts.B.b * aWeights[i] + BC67_WEIGHT_ROUND) >> BC67_WEIGHT_SHIFT;
+ }
+}
+
+float D3DX_BC6H::MapColors(const EncodeParams* pEP, size_t uRegion, size_t np, const size_t* auIndex) const
+{
+ assert( pEP );
+ const uint8_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uNumIndices = 1 << uIndexPrec;
+ INTColor aPalette[BC6H_MAX_INDICES];
+ GeneratePaletteUnquantized(pEP, uRegion, aPalette);
+
+ float fTotalErr = 0.0f;
+ for(size_t i = 0; i < np; ++i)
+ {
+ float fBestErr = Norm(pEP->aIPixels[auIndex[i]], aPalette[0]);
+ for(uint8_t j = 1; j < uNumIndices && fBestErr > 0.0f; ++j)
+ {
+ float fErr = Norm(pEP->aIPixels[auIndex[i]], aPalette[j]);
+ if(fErr > fBestErr) break; // error increased, so we're done searching
+ if(fErr < fBestErr) fBestErr = fErr;
+ }
+ fTotalErr += fBestErr;
+ }
+
+ return fTotalErr;
+}
+
+float D3DX_BC6H::RoughMSE(EncodeParams* pEP) const
+{
+ assert( pEP );
+ assert( pEP->uShape < BC6H_MAX_SHAPES);
+ __analysis_assume( pEP->uShape < BC6H_MAX_SHAPES);
+
+ INTEndPntPair* aEndPts = pEP->aUnqEndPts[pEP->uShape];
+
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC6H_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC6H_MAX_REGIONS );
+
+ size_t auPixIdx[NUM_PIXELS_PER_BLOCK];
+
+ float fError = 0.0f;
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ size_t np = 0;
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ if(g_aPartitionTable[uPartitions][pEP->uShape][i] == p)
+ {
+ auPixIdx[np++] = i;
+ }
+ }
+
+ // handle simple cases
+ assert(np > 0);
+ if(np == 1)
+ {
+ aEndPts[p].A = pEP->aIPixels[auPixIdx[0]];
+ aEndPts[p].B = pEP->aIPixels[auPixIdx[0]];
+ continue;
+ }
+ else if(np == 2)
+ {
+ aEndPts[p].A = pEP->aIPixels[auPixIdx[0]];
+ aEndPts[p].B = pEP->aIPixels[auPixIdx[1]];
+ continue;
+ }
+
+ HDRColorA epA, epB;
+ OptimizeRGB(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx);
+ aEndPts[p].A.Set(epA, pEP->bSigned);
+ aEndPts[p].B.Set(epB, pEP->bSigned);
+ if(pEP->bSigned)
+ {
+ aEndPts[p].A.Clamp(-F16MAX, F16MAX);
+ aEndPts[p].B.Clamp(-F16MAX, F16MAX);
+ }
+ else
+ {
+ aEndPts[p].A.Clamp(0, F16MAX);
+ aEndPts[p].B.Clamp(0, F16MAX);
+ }
+
+ fError += MapColors(pEP, p, np, auPixIdx);
+ }
+
+ return fError;
+}
+
+
+
+//-------------------------------------------------------------------------------------
+// BC7 Compression
+//-------------------------------------------------------------------------------------
+void D3DX_BC7::Decode(HDRColorA* pOut) const
+{
+ assert( pOut );
+
+ size_t uFirst = 0;
+ while(uFirst < 128 && !GetBit(uFirst)) {}
+ uint8_t uMode = uint8_t(uFirst - 1);
+
+ if(uMode < 8)
+ {
+ const uint8_t uPartitions = ms_aInfo[uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS );
+
+ const uint8_t uNumEndPts = (uPartitions + 1) << 1;
+ const uint8_t uIndexPrec = ms_aInfo[uMode].uIndexPrec;
+ const uint8_t uIndexPrec2 = ms_aInfo[uMode].uIndexPrec2;
+ register size_t i;
+ size_t uStartBit = uMode + 1;
+ uint8_t P[6];
+ uint8_t uShape = GetBits(uStartBit, ms_aInfo[uMode].uPartitionBits);
+ assert( uShape < BC7_MAX_SHAPES );
+ __analysis_assume( uShape < BC7_MAX_SHAPES );
+
+ uint8_t uRotation = GetBits(uStartBit, ms_aInfo[uMode].uRotationBits);
+ assert( uRotation < 4 );
+
+ uint8_t uIndexMode = GetBits(uStartBit, ms_aInfo[uMode].uIndexModeBits);
+ assert( uIndexMode < 2 );
+
+ LDRColorA c[BC7_MAX_REGIONS << 1];
+ const LDRColorA RGBAPrec = ms_aInfo[uMode].RGBAPrec;
+ const LDRColorA RGBAPrecWithP = ms_aInfo[uMode].RGBAPrecWithP;
+
+ assert( uNumEndPts <= (BC7_MAX_REGIONS << 1) );
+
+ // Red channel
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ if ( uStartBit + RGBAPrec.r > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ c[i].r = GetBits(uStartBit, RGBAPrec.r);
+ }
+
+ // Green channel
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ if ( uStartBit + RGBAPrec.g > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ c[i].g = GetBits(uStartBit, RGBAPrec.g);
+ }
+
+ // Blue channel
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ if ( uStartBit + RGBAPrec.b > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ c[i].b = GetBits(uStartBit, RGBAPrec.b);
+ }
+
+ // Alpha channel
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ if ( uStartBit + RGBAPrec.a > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ c[i].a = RGBAPrec.a ? GetBits(uStartBit, RGBAPrec.a) : 255;
+ }
+
+ // P-bits
+ assert( ms_aInfo[uMode].uPBits <= 6 );
+ __analysis_assume( ms_aInfo[uMode].uPBits <= 6 );
+ for(i = 0; i < ms_aInfo[uMode].uPBits; i++)
+ {
+ if ( uStartBit > 127 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+
+ P[i] = GetBit(uStartBit);
+ }
+
+ if(ms_aInfo[uMode].uPBits)
+ {
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ size_t pi = i * ms_aInfo[uMode].uPBits / uNumEndPts;
+ for(register uint8_t ch = 0; ch < BC7_NUM_CHANNELS; ch++)
+ {
+ if(RGBAPrec[ch] != RGBAPrecWithP[ch])
+ {
+ c[i][ch] = (c[i][ch] << 1) | P[pi];
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < uNumEndPts; i++)
+ {
+ c[i] = Unquantize(c[i], RGBAPrecWithP);
+ }
+
+ uint8_t w1[NUM_PIXELS_PER_BLOCK], w2[NUM_PIXELS_PER_BLOCK];
+
+ // read color indices
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ size_t uNumBits = IsFixUpOffset(ms_aInfo[uMode].uPartitions, uShape, i) ? uIndexPrec - 1 : uIndexPrec;
+ if ( uStartBit + uNumBits > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+ w1[i] = GetBits(uStartBit, uNumBits);
+ }
+
+ // read alpha indices
+ if(uIndexPrec2)
+ {
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ size_t uNumBits = i ? uIndexPrec2 : uIndexPrec2 - 1;
+ if ( uStartBit + uNumBits > 128 )
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid block encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ return;
+ }
+ w2[i] = GetBits(uStartBit, uNumBits );
+ }
+ }
+
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i];
+ LDRColorA outPixel;
+ if(uIndexPrec2 == 0)
+ {
+ LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w1[i], w1[i], uIndexPrec, uIndexPrec, outPixel);
+ }
+ else
+ {
+ if(uIndexMode == 0)
+ {
+ LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w1[i], w2[i], uIndexPrec, uIndexPrec2, outPixel);
+ }
+ else
+ {
+ LDRColorA::Interpolate(c[uRegion << 1], c[(uRegion << 1) + 1], w2[i], w1[i], uIndexPrec2, uIndexPrec, outPixel);
+ }
+ }
+
+ switch(uRotation)
+ {
+ case 1: Swap(outPixel.r, outPixel.a); break;
+ case 2: Swap(outPixel.g, outPixel.a); break;
+ case 3: Swap(outPixel.b, outPixel.a); break;
+ }
+
+ pOut[i] = HDRColorA(outPixel);
+ }
+ }
+ else
+ {
+#ifdef _DEBUG
+ OutputDebugStringA( "BC7: Invalid mode encountered during decoding\n" );
+#endif
+ FillWithErrorColors( pOut );
+ }
+}
+
+void D3DX_BC7::Encode(const HDRColorA* const pIn)
+{
+ assert( pIn );
+
+ D3DX_BC7 final = *this;
+ EncodeParams EP(pIn);
+ float fMSEBest = FLT_MAX;
+
+ for(size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ {
+ EP.aLDRPixels[i].r = uint8_t( std::max<float>( 0.0f, std::min<float>( 255.0f, pIn[i].r * 255.0f + 0.01f ) ) );
+ EP.aLDRPixels[i].g = uint8_t( std::max<float>( 0.0f, std::min<float>( 255.0f, pIn[i].g * 255.0f + 0.01f ) ) );
+ EP.aLDRPixels[i].b = uint8_t( std::max<float>( 0.0f, std::min<float>( 255.0f, pIn[i].b * 255.0f + 0.01f ) ) );
+ EP.aLDRPixels[i].a = uint8_t( std::max<float>( 0.0f, std::min<float>( 255.0f, pIn[i].a * 255.0f + 0.01f ) ) );
+ }
+
+ for(EP.uMode = 0; EP.uMode < 8 && fMSEBest > 0; ++EP.uMode)
+ {
+ const size_t uShapes = 1 << ms_aInfo[EP.uMode].uPartitionBits;
+ assert( uShapes <= BC7_MAX_SHAPES );
+ __analysis_assume( uShapes <= BC7_MAX_SHAPES );
+
+ const size_t uNumRots = 1 << ms_aInfo[EP.uMode].uRotationBits;
+ const size_t uNumIdxMode = 1 << ms_aInfo[EP.uMode].uIndexModeBits;
+ // Number of rough cases to examine; reasonable values are 1, uShapes/4, and uShapes.
+ // uShapes/4 catches nearly all the cases; increase it a bit (say by 3 or 4) to squeeze out the last bit of quality.
+ const size_t uItems = std::max<size_t>(1, uShapes >> 2);
+ float afRoughMSE[BC7_MAX_SHAPES];
+ size_t auShape[BC7_MAX_SHAPES];
+
+ for(size_t r = 0; r < uNumRots && fMSEBest > 0; ++r)
+ {
+ switch(r)
+ {
+ case 1: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].r, EP.aLDRPixels[i].a); break;
+ case 2: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].g, EP.aLDRPixels[i].a); break;
+ case 3: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].b, EP.aLDRPixels[i].a); break;
+ }
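+ // rotation r swaps one color channel with alpha in the working pixels so that channel is
+ // coded through the separate alpha endpoints/indices; the decoder undoes this via the
+ // rotation bits, and the swap on the source pixels is reversed after the search below.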
+
+ for(size_t im = 0; im < uNumIdxMode && fMSEBest > 0; ++im)
+ {
+ // pick the best uItems shapes and refine these.
+ for(size_t s = 0; s < uShapes; s++)
+ {
+ afRoughMSE[s] = RoughMSE(&EP, s, im);
+ auShape[s] = s;
+ }
+
+ // Bubble up the first uItems items
+ for(size_t i = 0; i < uItems; i++)
+ {
+ for(size_t j = i + 1; j < uShapes; j++)
+ {
+ if(afRoughMSE[i] > afRoughMSE[j])
+ {
+ Swap(afRoughMSE[i], afRoughMSE[j]);
+ Swap(auShape[i], auShape[j]);
+ }
+ }
+ }
+
+ for(size_t i = 0; i < uItems && fMSEBest > 0; i++)
+ {
+ float fMSE = Refine(&EP, auShape[i], r, im);
+ if(fMSE < fMSEBest)
+ {
+ final = *this;
+ fMSEBest = fMSE;
+ }
+ }
+ }
+
+ switch(r)
+ {
+ case 1: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].r, EP.aLDRPixels[i].a); break;
+ case 2: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].g, EP.aLDRPixels[i].a); break;
+ case 3: for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++) Swap(EP.aLDRPixels[i].b, EP.aLDRPixels[i].a); break;
+ }
+ }
+ }
+
+ *this = final;
+}
+
+
+//-------------------------------------------------------------------------------------
+void D3DX_BC7::GeneratePaletteQuantized(const EncodeParams* pEP, size_t uIndexMode, const LDREndPntPair& endPts, LDRColorA aPalette[]) const
+{
+ assert( pEP );
+ const size_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec;
+ const size_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2;
+ const size_t uNumIndices = 1 << uIndexPrec;
+ const size_t uNumIndices2 = 1 << uIndexPrec2;
+ assert( uNumIndices > 0 && uNumIndices2 > 0 );
+ __analysis_assume( uNumIndices > 0 && uNumIndices2 > 0 );
+ assert( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) );
+ __analysis_assume( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) );
+
+ LDRColorA a = Unquantize(endPts.A, ms_aInfo[pEP->uMode].RGBAPrecWithP);
+ LDRColorA b = Unquantize(endPts.B, ms_aInfo[pEP->uMode].RGBAPrecWithP);
+ if(uIndexPrec2 == 0)
+ {
+ for(register size_t i = 0; i < uNumIndices; i++)
+ LDRColorA::Interpolate(a, b, i, i, uIndexPrec, uIndexPrec, aPalette[i]);
+ }
+ else
+ {
+ for(register size_t i = 0; i < uNumIndices; i++)
+ LDRColorA::InterpolateRGB(a, b, i, uIndexPrec, aPalette[i]);
+ for(register size_t i = 0; i < uNumIndices2; i++)
+ LDRColorA::InterpolateA(a, b, i, uIndexPrec2, aPalette[i]);
+ }
+}
+
+float D3DX_BC7::PerturbOne(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, size_t ch,
+ const LDREndPntPair &oldEndPts, LDREndPntPair &newEndPts, float fOldErr, uint8_t do_b) const
+{
+ assert( pEP );
+ const int prec = ms_aInfo[pEP->uMode].RGBAPrecWithP[ch];
+ LDREndPntPair tmp_endPts = newEndPts = oldEndPts;
+ float fMinErr = fOldErr;
+ uint8_t* pnew_c = (do_b ? &newEndPts.B[ch] : &newEndPts.A[ch]);
+ uint8_t* ptmp_c = (do_b ? &tmp_endPts.B[ch] : &tmp_endPts.A[ch]);
+
+ // do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+ for(int step = 1 << (prec-1); step; step >>= 1)
+ {
+ bool bImproved = false;
+ int beststep = 0;
+ for(int sign = -1; sign <= 1; sign += 2)
+ {
+ int tmp = int(*pnew_c) + sign * step;
+ if(tmp < 0 || tmp >= (1 << prec))
+ continue;
+ else
+ *ptmp_c = (uint8_t) tmp;
+
+ float fTotalErr = MapColors(pEP, aColors, np, uIndexMode, tmp_endPts, fMinErr);
+ if(fTotalErr < fMinErr)
+ {
+ bImproved = true;
+ fMinErr = fTotalErr;
+ beststep = sign * step;
+ }
+ }
+
+ // if this was an improvement, move the endpoint and continue search from there
+ if(bImproved)
+ *pnew_c = uint8_t(int(*pnew_c) + beststep);
+ }
+ return fMinErr;
+}
+
+// exhaustively search the endpoints within a small window (delta of 5 below) around the current optimum.
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+void D3DX_BC7::Exhaustive(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, size_t ch,
+ float& fOrgErr, LDREndPntPair& optEndPt) const
+{
+ assert( pEP );
+ const uint8_t uPrec = ms_aInfo[pEP->uMode].RGBAPrecWithP[ch];
+ LDREndPntPair tmpEndPt;
+ if(fOrgErr == 0)
+ return;
+
+ int delta = 5;
+
+ // ok figure out the range of A and B
+ tmpEndPt = optEndPt;
+ int alow = std::max<int>(0, int(optEndPt.A[ch]) - delta);
+ int ahigh = std::min<int>((1 << uPrec) - 1, int(optEndPt.A[ch]) + delta);
+ int blow = std::max<int>(0, int(optEndPt.B[ch]) - delta);
+ int bhigh = std::min<int>((1 << uPrec) - 1, int(optEndPt.B[ch]) + delta);
+ int amin = 0;
+ int bmin = 0;
+
+ float fBestErr = fOrgErr;
+ if(optEndPt.A[ch] <= optEndPt.B[ch])
+ {
+ // keep a <= b
+ for(int a = alow; a <= ahigh; ++a)
+ {
+ for(int b = std::max<int>(a, blow); b <= bhigh; ++b)
+ {
+ tmpEndPt.A[ch] = (uint8_t) a;
+ tmpEndPt.B[ch] = (uint8_t) b;
+
+ float fErr = MapColors(pEP, aColors, np, uIndexMode, tmpEndPt, fBestErr);
+ if(fErr < fBestErr)
+ {
+ amin = a;
+ bmin = b;
+ fBestErr = fErr;
+ }
+ }
+ }
+ }
+ else
+ {
+ // keep b <= a
+ for(int b = blow; b <= bhigh; ++b)
+ {
+ for(int a = std::max<int>(b, alow); a <= ahigh; ++a)
+ {
+ tmpEndPt.A[ch] = (uint8_t) a;
+ tmpEndPt.B[ch] = (uint8_t) b;
+
+ float fErr = MapColors(pEP, aColors, np, uIndexMode, tmpEndPt, fBestErr);
+ if(fErr < fBestErr)
+ {
+ amin = a;
+ bmin = b;
+ fBestErr = fErr;
+ }
+ }
+ }
+ }
+
+ if(fBestErr < fOrgErr)
+ {
+ optEndPt.A[ch] = (uint8_t) amin;
+ optEndPt.B[ch] = (uint8_t) bmin;
+ fOrgErr = fBestErr;
+ }
+}
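+
+// Scale of the exhaustive pass above: with delta = 5 each channel scans at most an 11 x 11
+// window of (A, B) values around the current optimum, and the ordering constraint roughly
+// halves that, so the cost stays small relative to the logarithmic search that precedes it.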
+
+void D3DX_BC7::OptimizeOne(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode,
+ float fOrgErr, const LDREndPntPair& org, LDREndPntPair& opt) const
+{
+ assert( pEP );
+
+ float fOptErr = fOrgErr;
+ opt = org;
+
+ LDREndPntPair new_a, new_b;
+ LDREndPntPair newEndPts;
+ uint8_t do_b;
+
+ // now optimize each channel separately
+ for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ++ch)
+ {
+ if(ms_aInfo[pEP->uMode].RGBAPrecWithP[ch] == 0)
+ continue;
+
+ // figure out which endpoint when perturbed gives the most improvement and start there
+ // if we just alternate, we can easily end up in a local minimum
+ float fErr0 = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, new_a, fOptErr, 0); // perturb endpt A
+ float fErr1 = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, new_b, fOptErr, 1); // perturb endpt B
+
+ uint8_t& copt_a = opt.A[ch];
+ uint8_t& copt_b = opt.B[ch];
+ uint8_t& cnew_a = new_a.A[ch];
+ uint8_t& cnew_b = new_b.B[ch];
+
+ if(fErr0 < fErr1)
+ {
+ if(fErr0 >= fOptErr)
+ continue;
+ copt_a = cnew_a;
+ fOptErr = fErr0;
+ do_b = 1; // do B next
+ }
+ else
+ {
+ if(fErr1 >= fOptErr)
+ continue;
+ copt_b = cnew_b;
+ fOptErr = fErr1;
+ do_b = 0; // do A next
+ }
+
+ // now alternate endpoints and keep trying until there is no improvement
+ for( ; ; )
+ {
+ float fErr = PerturbOne(pEP, aColors, np, uIndexMode, ch, opt, newEndPts, fOptErr, do_b);
+ if(fErr >= fOptErr)
+ break;
+ if(do_b == 0)
+ copt_a = newEndPts.A[ch];
+ else
+ copt_b = newEndPts.B[ch];
+ fOptErr = fErr;
+ do_b = 1 - do_b; // now move the other endpoint
+ }
+ }
+
+ // finally, do a small exhaustive search around what we think is the global minimum to be sure
+ for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ch++)
+ Exhaustive(pEP, aColors, np, uIndexMode, ch, fOptErr, opt);
+}
+
+void D3DX_BC7::OptimizeEndPoints(const EncodeParams* pEP, size_t uShape, size_t uIndexMode, const float afOrgErr[],
+ const LDREndPntPair aOrgEndPts[], LDREndPntPair aOptEndPts[]) const
+{
+ assert( pEP );
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS && uShape < BC7_MAX_SHAPES );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS && uShape < BC7_MAX_SHAPES );
+
+ LDRColorA aPixels[NUM_PIXELS_PER_BLOCK];
+
+ for(size_t p = 0; p <= uPartitions; ++p)
+ {
+ // collect the pixels in the region
+ size_t np = 0;
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
+ if(g_aPartitionTable[uPartitions][uShape][i] == p)
+ aPixels[np++] = pEP->aLDRPixels[i];
+
+ OptimizeOne(pEP, aPixels, np, uIndexMode, afOrgErr[p], aOrgEndPts[p], aOptEndPts[p]);
+ }
+}
+
+void D3DX_BC7::AssignIndices(const EncodeParams* pEP, size_t uShape, size_t uIndexMode, LDREndPntPair endPts[], size_t aIndices[], size_t aIndices2[],
+ float afTotErr[]) const
+{
+ assert( pEP );
+ assert( uShape < BC7_MAX_SHAPES );
+ __analysis_assume( uShape < BC7_MAX_SHAPES );
+
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS );
+
+ const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2;
+ const uint8_t uNumIndices = 1 << uIndexPrec;
+ const uint8_t uNumIndices2 = 1 << uIndexPrec2;
+
+ assert( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) );
+ __analysis_assume( (uNumIndices <= BC7_MAX_INDICES) && (uNumIndices2 <= BC7_MAX_INDICES) );
+
+ const uint8_t uHighestIndexBit = uNumIndices >> 1;
+ const uint8_t uHighestIndexBit2 = uNumIndices2 >> 1;
+ LDRColorA aPalette[BC7_MAX_REGIONS][BC7_MAX_INDICES];
+
+ // build list of possibles
+ LDREndPntPair adjusted_endPts;
+ for(size_t p = 0; p <= uPartitions; p++)
+ {
+ GeneratePaletteQuantized(pEP, uIndexMode, endPts[p], aPalette[p]);
+ afTotErr[p] = 0;
+ }
+
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i];
+ assert( uRegion < BC7_MAX_REGIONS );
+ __analysis_assume( uRegion < BC7_MAX_REGIONS );
+ afTotErr[uRegion] += ComputeError(pEP->aLDRPixels[i], aPalette[uRegion], uIndexPrec, uIndexPrec2, &(aIndices[i]), &(aIndices2[i]));
+ }
+
+ // swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+ if(uIndexPrec2 == 0)
+ {
+ for(register size_t p = 0; p <= uPartitions; p++)
+ {
+ if(aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit)
+ {
+ Swap(endPts[p].A, endPts[p].B);
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ if(g_aPartitionTable[uPartitions][uShape][i] == p)
+ aIndices[i] = uNumIndices - 1 - aIndices[i];
+ }
+ assert((aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) == 0);
+ }
+ }
+ else
+ {
+ for(register size_t p = 0; p <= uPartitions; p++)
+ {
+ if(aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit)
+ {
+ Swap(endPts[p].A.r, endPts[p].B.r);
+ Swap(endPts[p].A.g, endPts[p].B.g);
+ Swap(endPts[p].A.b, endPts[p].B.b);
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ if(g_aPartitionTable[uPartitions][uShape][i] == p)
+ aIndices[i] = uNumIndices - 1 - aIndices[i];
+ }
+ assert((aIndices[g_aFixUp[uPartitions][uShape][p]] & uHighestIndexBit) == 0);
+
+ if(aIndices2[0] & uHighestIndexBit2)
+ {
+ Swap(endPts[p].A.a, endPts[p].B.a);
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ aIndices2[i] = uNumIndices2 - 1 - aIndices2[i];
+ }
+ assert((aIndices2[0] & uHighestIndexBit2) == 0);
+ }
+ }
+}
+
+void D3DX_BC7::EmitBlock(const EncodeParams* pEP, size_t uShape, size_t uRotation, size_t uIndexMode, const LDREndPntPair aEndPts[], const size_t aIndex[], const size_t aIndex2[])
+{
+ assert( pEP );
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS );
+
+ const size_t uPBits = ms_aInfo[pEP->uMode].uPBits;
+ const size_t uIndexPrec = ms_aInfo[pEP->uMode].uIndexPrec;
+ const size_t uIndexPrec2 = ms_aInfo[pEP->uMode].uIndexPrec2;
+ const LDRColorA RGBAPrec = ms_aInfo[pEP->uMode].RGBAPrec;
+ const LDRColorA RGBAPrecWithP = ms_aInfo[pEP->uMode].RGBAPrecWithP;
+ register size_t i;
+ size_t uStartBit = 0;
+ SetBits(uStartBit, pEP->uMode, 0);
+ SetBits(uStartBit, 1, 1);
+ SetBits(uStartBit, ms_aInfo[pEP->uMode].uRotationBits, static_cast<uint8_t>( uRotation ));
+ SetBits(uStartBit, ms_aInfo[pEP->uMode].uIndexModeBits, static_cast<uint8_t>( uIndexMode ));
+ SetBits(uStartBit, ms_aInfo[pEP->uMode].uPartitionBits, static_cast<uint8_t>( uShape ));
+
+ if(uPBits)
+ {
+ const size_t uNumEP = (1 + uPartitions) << 1;
+ uint8_t aPVote[BC7_MAX_REGIONS << 1] = {0,0,0,0,0,0};
+ uint8_t aCount[BC7_MAX_REGIONS << 1] = {0,0,0,0,0,0};
+ for(uint8_t ch = 0; ch < BC7_NUM_CHANNELS; ch++)
+ {
+ uint8_t ep = 0;
+ for(i = 0; i <= uPartitions; i++)
+ {
+ if(RGBAPrec[ch] == RGBAPrecWithP[ch])
+ {
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch]);
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch]);
+ }
+ else
+ {
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch] >> 1);
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch] >> 1);
+ size_t idx = ep++ * uPBits / uNumEP;
+ assert(idx < (BC7_MAX_REGIONS << 1));
+ __analysis_assume(idx < (BC7_MAX_REGIONS << 1));
+ aPVote[idx] += aEndPts[i].A[ch] & 0x01;
+ aCount[idx]++;
+ idx = ep++ * uPBits / uNumEP;
+ assert(idx < (BC7_MAX_REGIONS << 1));
+ __analysis_assume(idx < (BC7_MAX_REGIONS << 1));
+ aPVote[idx] += aEndPts[i].B[ch] & 0x01;
+ aCount[idx]++;
+ }
+ }
+ }
+
+ for(i = 0; i < uPBits; i++)
+ {
+ SetBits(uStartBit, 1, aPVote[i] > (aCount[i] >> 1) ? 1 : 0);
+ }
+ }
+ else
+ {
+ for(size_t ch = 0; ch < BC7_NUM_CHANNELS; ch++)
+ {
+ for(i = 0; i <= uPartitions; i++)
+ {
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].A[ch] );
+ SetBits(uStartBit, RGBAPrec[ch], aEndPts[i].B[ch] );
+ }
+ }
+ }
+
+ const size_t* aI1 = uIndexMode ? aIndex2 : aIndex;
+ const size_t* aI2 = uIndexMode ? aIndex : aIndex2;
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ if(IsFixUpOffset(ms_aInfo[pEP->uMode].uPartitions, uShape, i))
+ SetBits(uStartBit, uIndexPrec - 1, static_cast<uint8_t>( aI1[i] ));
+ else
+ SetBits(uStartBit, uIndexPrec, static_cast<uint8_t>( aI1[i] ));
+ }
+ if(uIndexPrec2)
+ for(i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ SetBits(uStartBit, i ? uIndexPrec2 : uIndexPrec2 - 1, static_cast<uint8_t>( aI2[i] ));
+
+ assert(uStartBit == 128);
+}
+
+float D3DX_BC7::Refine(const EncodeParams* pEP, size_t uShape, size_t uRotation, size_t uIndexMode)
+{
+ assert( pEP );
+ assert( uShape < BC7_MAX_SHAPES );
+ __analysis_assume( uShape < BC7_MAX_SHAPES );
+ const LDREndPntPair* aEndPts = pEP->aEndPts[uShape];
+
+ const size_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS );
+
+ LDREndPntPair aOrgEndPts[BC7_MAX_REGIONS];
+ LDREndPntPair aOptEndPts[BC7_MAX_REGIONS];
+ size_t aOrgIdx[NUM_PIXELS_PER_BLOCK];
+ size_t aOrgIdx2[NUM_PIXELS_PER_BLOCK];
+ size_t aOptIdx[NUM_PIXELS_PER_BLOCK];
+ size_t aOptIdx2[NUM_PIXELS_PER_BLOCK];
+ float aOrgErr[BC7_MAX_REGIONS];
+ float aOptErr[BC7_MAX_REGIONS];
+
+ for(register size_t p = 0; p <= uPartitions; p++)
+ {
+ aOrgEndPts[p].A = Quantize(aEndPts[p].A, ms_aInfo[pEP->uMode].RGBAPrecWithP);
+ aOrgEndPts[p].B = Quantize(aEndPts[p].B, ms_aInfo[pEP->uMode].RGBAPrecWithP);
+ }
+
+ AssignIndices(pEP, uShape, uIndexMode, aOrgEndPts, aOrgIdx, aOrgIdx2, aOrgErr);
+ OptimizeEndPoints(pEP, uShape, uIndexMode, aOrgErr, aOrgEndPts, aOptEndPts);
+ AssignIndices(pEP, uShape, uIndexMode, aOptEndPts, aOptIdx, aOptIdx2, aOptErr);
+
+ float fOrgTotErr = 0, fOptTotErr = 0;
+ for(register size_t p = 0; p <= uPartitions; p++)
+ {
+ fOrgTotErr += aOrgErr[p];
+ fOptTotErr += aOptErr[p];
+ }
+ if(fOptTotErr < fOrgTotErr)
+ {
+ EmitBlock(pEP, uShape, uRotation, uIndexMode, aOptEndPts, aOptIdx, aOptIdx2);
+ return fOptTotErr;
+ }
+ else
+ {
+ EmitBlock(pEP, uShape, uRotation, uIndexMode, aOrgEndPts, aOrgIdx, aOrgIdx2);
+ return fOrgTotErr;
+ }
+}
+
+float D3DX_BC7::MapColors(const EncodeParams* pEP, const LDRColorA aColors[], size_t np, size_t uIndexMode, const LDREndPntPair& endPts, float fMinErr) const
+{
+ assert( pEP );
+ const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2;
+ LDRColorA aPalette[BC7_MAX_INDICES];
+ float fTotalErr = 0;
+
+ GeneratePaletteQuantized(pEP, uIndexMode, endPts, aPalette);
+ for(register size_t i = 0; i < np; ++i)
+ {
+ fTotalErr += ComputeError(aColors[i], aPalette, uIndexPrec, uIndexPrec2);
+ if(fTotalErr > fMinErr) // check for early exit
+ {
+ fTotalErr = FLT_MAX;
+ break;
+ }
+ }
+
+ return fTotalErr;
+}
+
+float D3DX_BC7::RoughMSE(EncodeParams* pEP, size_t uShape, size_t uIndexMode)
+{
+ assert( pEP );
+ assert( uShape < BC7_MAX_SHAPES );
+ __analysis_assume( uShape < BC7_MAX_SHAPES );
+ LDREndPntPair* aEndPts = pEP->aEndPts[uShape];
+
+ const uint8_t uPartitions = ms_aInfo[pEP->uMode].uPartitions;
+ assert( uPartitions < BC7_MAX_REGIONS );
+ __analysis_assume( uPartitions < BC7_MAX_REGIONS );
+
+ const uint8_t uIndexPrec = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec2 : ms_aInfo[pEP->uMode].uIndexPrec;
+ const uint8_t uIndexPrec2 = uIndexMode ? ms_aInfo[pEP->uMode].uIndexPrec : ms_aInfo[pEP->uMode].uIndexPrec2;
+ const uint8_t uNumIndices = 1 << uIndexPrec;
+ const uint8_t uNumIndices2 = 1 << uIndexPrec2;
+ size_t auPixIdx[NUM_PIXELS_PER_BLOCK];
+ LDRColorA aPalette[BC7_MAX_REGIONS][BC7_MAX_INDICES];
+
+ for(size_t p = 0; p <= uPartitions; p++)
+ {
+ size_t np = 0;
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ if (g_aPartitionTable[uPartitions][uShape][i] == p)
+ {
+ auPixIdx[np++] = i;
+ }
+ }
+
+ // handle simple cases
+ assert(np > 0);
+ if(np == 1)
+ {
+ aEndPts[p].A = pEP->aLDRPixels[auPixIdx[0]];
+ aEndPts[p].B = pEP->aLDRPixels[auPixIdx[0]];
+ continue;
+ }
+ else if(np == 2)
+ {
+ aEndPts[p].A = pEP->aLDRPixels[auPixIdx[0]];
+ aEndPts[p].B = pEP->aLDRPixels[auPixIdx[1]];
+ continue;
+ }
+
+ if(uIndexPrec2 == 0)
+ {
+ HDRColorA epA, epB;
+ OptimizeRGBA(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx);
+ epA.Clamp(0.0f, 1.0f);
+ epB.Clamp(0.0f, 1.0f);
+ epA *= 255.0f;
+ epB *= 255.0f;
+ aEndPts[p].A = epA.ToLDRColorA();
+ aEndPts[p].B = epB.ToLDRColorA();
+ }
+ else
+ {
+ uint8_t uMinAlpha = 255, uMaxAlpha = 0;
+ for(register size_t i = 0; i < np; ++i)
+ {
+ uMinAlpha = std::min<uint8_t>(uMinAlpha, pEP->aLDRPixels[auPixIdx[i]].a);
+ uMaxAlpha = std::max<uint8_t>(uMaxAlpha, pEP->aLDRPixels[auPixIdx[i]].a);
+ }
+
+ HDRColorA epA, epB;
+ OptimizeRGB(pEP->aHDRPixels, &epA, &epB, 4, np, auPixIdx);
+ epA.Clamp(0.0f, 1.0f);
+ epB.Clamp(0.0f, 1.0f);
+ epA *= 255.0f;
+ epB *= 255.0f;
+ aEndPts[p].A = epA.ToLDRColorA();
+ aEndPts[p].B = epB.ToLDRColorA();
+ aEndPts[p].A.a = uMinAlpha;
+ aEndPts[p].B.a = uMaxAlpha;
+ }
+ }
+
+ if(uIndexPrec2 == 0)
+ {
+ for(size_t p = 0; p <= uPartitions; p++)
+ for(register size_t i = 0; i < uNumIndices; i++)
+ LDRColorA::Interpolate(aEndPts[p].A, aEndPts[p].B, i, i, uIndexPrec, uIndexPrec, aPalette[p][i]);
+ }
+ else
+ {
+ for(size_t p = 0; p <= uPartitions; p++)
+ {
+ for(register size_t i = 0; i < uNumIndices; i++)
+ LDRColorA::InterpolateRGB(aEndPts[p].A, aEndPts[p].B, i, uIndexPrec, aPalette[p][i]);
+ for(register size_t i = 0; i < uNumIndices2; i++)
+ LDRColorA::InterpolateA(aEndPts[p].A, aEndPts[p].B, i, uIndexPrec2, aPalette[p][i]);
+ }
+ }
+
+ float fTotalErr = 0;
+ for(register size_t i = 0; i < NUM_PIXELS_PER_BLOCK; i++)
+ {
+ uint8_t uRegion = g_aPartitionTable[uPartitions][uShape][i];
+ fTotalErr += ComputeError(pEP->aLDRPixels[i], aPalette[uRegion], uIndexPrec, uIndexPrec2);
+ }
+
+ return fTotalErr;
+}
+
+//=====================================================================================
+// Entry points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// BC6H Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC6HU(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" );
+ reinterpret_cast< const D3DX_BC6H* >( pBC )->Decode(false, reinterpret_cast<HDRColorA*>(pColor));
+}
+
+void D3DXDecodeBC6HS(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" );
+ reinterpret_cast< const D3DX_BC6H* >( pBC )->Decode(true, reinterpret_cast<HDRColorA*>(pColor));
+}
+
+void D3DXEncodeBC6HU(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags)
+{
+ UNREFERENCED_PARAMETER(flags);
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" );
+ reinterpret_cast< D3DX_BC6H* >( pBC )->Encode(false, reinterpret_cast<const HDRColorA*>(pColor));
+}
+
+void D3DXEncodeBC6HS(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags)
+{
+ UNREFERENCED_PARAMETER(flags);
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC6H) == 16, "D3DX_BC6H should be 16 bytes" );
+ reinterpret_cast< D3DX_BC6H* >( pBC )->Encode(true, reinterpret_cast<const HDRColorA*>(pColor));
+}
+
+
+//-------------------------------------------------------------------------------------
+// BC7 Compression
+//-------------------------------------------------------------------------------------
+void D3DXDecodeBC7(XMVECTOR *pColor, const uint8_t *pBC)
+{
+ assert( pColor && pBC );
+ static_assert( sizeof(D3DX_BC7) == 16, "D3DX_BC7 should be 16 bytes" );
+ reinterpret_cast< const D3DX_BC7* >( pBC )->Decode(reinterpret_cast<HDRColorA*>(pColor));
+}
+
+void D3DXEncodeBC7(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags)
+{
+ UNREFERENCED_PARAMETER(flags);
+ assert( pBC && pColor );
+ static_assert( sizeof(D3DX_BC7) == 16, "D3DX_BC7 should be 16 bytes" );
+ reinterpret_cast< D3DX_BC7* >( pBC )->Encode(reinterpret_cast<const HDRColorA*>(pColor));
+}
+
+} // namespace
\ No newline at end of file
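A minimal caller sketch (editorial illustration, not part of the library): the entry points above operate on a single 4x4 block of 16 bytes covering 16 texels, with the texel values passed as XMVECTORs holding RGBA in [0, 1]. The include and the helper name below are assumptions.

    // Round-trip one 4x4 block through the BC7 entry points (hypothetical helper).
    #include "BC.h"   // assumed to declare the D3DXEncodeBC7/D3DXDecodeBC7 prototypes
    using namespace DirectX;

    void RoundTripBC7Block( const XMVECTOR texelsIn[16], XMVECTOR texelsOut[16] )
    {
        uint8_t block[16];
        D3DXEncodeBC7( block, texelsIn, 0 );   // flags are currently unreferenced by the encoder
        D3DXDecodeBC7( texelsOut, block );
    }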
diff --git a/thirdparty/directxtex/DirectXTex/DDS.h b/thirdparty/directxtex/DirectXTex/DDS.h
new file mode 100644
index 00000000..6e913957
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DDS.h
@@ -0,0 +1,214 @@
+//--------------------------------------------------------------------------------------
+// dds.h
+//
+// This header defines constants and structures that are useful when parsing
+// DDS files. DDS files were originally designed to use several structures
+// and constants that are native to DirectDraw and are defined in ddraw.h,
+// such as DDSURFACEDESC2 and DDSCAPS2. This file defines similar
+// (compatible) constants and structures so that one can use DDS files
+// without needing to include ddraw.h.
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//--------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#include <dxgiformat.h>
+
+#pragma warning(push)
+#pragma warning(disable : 4005)
+#include <stdint.h>
+#pragma warning(pop)
+
+namespace DirectX
+{
+
+#pragma pack(push,1)
+
+const uint32_t DDS_MAGIC = 0x20534444; // "DDS "
+
+struct DDS_PIXELFORMAT
+{
+ uint32_t dwSize;
+ uint32_t dwFlags;
+ uint32_t dwFourCC;
+ uint32_t dwRGBBitCount;
+ uint32_t dwRBitMask;
+ uint32_t dwGBitMask;
+ uint32_t dwBBitMask;
+ uint32_t dwABitMask;
+};
+
+#define DDS_FOURCC 0x00000004 // DDPF_FOURCC
+#define DDS_RGB 0x00000040 // DDPF_RGB
+#define DDS_RGBA 0x00000041 // DDPF_RGB | DDPF_ALPHAPIXELS
+#define DDS_LUMINANCE 0x00020000 // DDPF_LUMINANCE
+#define DDS_LUMINANCEA 0x00020001 // DDPF_LUMINANCE | DDPF_ALPHAPIXELS
+#define DDS_ALPHA 0x00000002 // DDPF_ALPHA
+#define DDS_PAL8 0x00000020 // DDPF_PALETTEINDEXED8
+
+#ifndef MAKEFOURCC
+ #define MAKEFOURCC(ch0, ch1, ch2, ch3) \
+ ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | \
+ ((uint32_t)(uint8_t)(ch2) << 16) | ((uint32_t)(uint8_t)(ch3) << 24 ))
+#endif /* defined(MAKEFOURCC) */
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT1 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','1'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT2 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','2'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT3 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','3'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT4 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','4'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DXT5 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','T','5'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC4_UNORM =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','4','U'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC4_SNORM =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','4','S'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC5_UNORM =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','5','U'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_BC5_SNORM =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('B','C','5','S'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R8G8_B8G8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('R','G','B','G'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_G8R8_G8B8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('G','R','G','B'), 0, 0, 0, 0, 0 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8R8G8B8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 32, 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_X8R8G8B8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8B8G8R8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 32, 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_X8B8G8R8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_G16R16 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x0000ffff, 0xffff0000, 0x00000000, 0x00000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R5G6B5 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A1R5G5B5 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 16, 0x00007c00, 0x000003e0, 0x0000001f, 0x00008000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A4R4G4B4 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGBA, 0, 16, 0x00000f00, 0x000000f0, 0x0000000f, 0x0000f000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_R8G8B8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 24, 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_L8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0xff, 0x00, 0x00, 0x00 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_L16 =
+ { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 16, 0xffff, 0x0000, 0x0000, 0x0000 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8L8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCEA, 0, 16, 0x00ff, 0x0000, 0x0000, 0xff00 };
+
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_A8 =
+ { sizeof(DDS_PIXELFORMAT), DDS_ALPHA, 0, 8, 0x00, 0x00, 0x00, 0xff };
+
+// D3DFMT_A2R10G10B10/D3DFMT_A2B10G10R10 should be written using DX10 extension to avoid D3DX 10:10:10:2 reversal issue
+
+// This indicates the DDS_HEADER_DXT10 extension is present (the format is in dxgiFormat)
+extern __declspec(selectany) const DDS_PIXELFORMAT DDSPF_DX10 =
+ { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC('D','X','1','0'), 0, 0, 0, 0, 0 };
+
+#define DDS_HEADER_FLAGS_TEXTURE 0x00001007 // DDSD_CAPS | DDSD_HEIGHT | DDSD_WIDTH | DDSD_PIXELFORMAT
+#define DDS_HEADER_FLAGS_MIPMAP 0x00020000 // DDSD_MIPMAPCOUNT
+#define DDS_HEADER_FLAGS_VOLUME 0x00800000 // DDSD_DEPTH
+#define DDS_HEADER_FLAGS_PITCH 0x00000008 // DDSD_PITCH
+#define DDS_HEADER_FLAGS_LINEARSIZE 0x00080000 // DDSD_LINEARSIZE
+
+#define DDS_HEIGHT 0x00000002 // DDSD_HEIGHT
+#define DDS_WIDTH 0x00000004 // DDSD_WIDTH
+
+#define DDS_SURFACE_FLAGS_TEXTURE 0x00001000 // DDSCAPS_TEXTURE
+#define DDS_SURFACE_FLAGS_MIPMAP 0x00400008 // DDSCAPS_COMPLEX | DDSCAPS_MIPMAP
+#define DDS_SURFACE_FLAGS_CUBEMAP 0x00000008 // DDSCAPS_COMPLEX
+
+#define DDS_CUBEMAP_POSITIVEX 0x00000600 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEX
+#define DDS_CUBEMAP_NEGATIVEX 0x00000a00 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEX
+#define DDS_CUBEMAP_POSITIVEY 0x00001200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEY
+#define DDS_CUBEMAP_NEGATIVEY 0x00002200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEY
+#define DDS_CUBEMAP_POSITIVEZ 0x00004200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_POSITIVEZ
+#define DDS_CUBEMAP_NEGATIVEZ 0x00008200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEZ
+
+#define DDS_CUBEMAP_ALLFACES ( DDS_CUBEMAP_POSITIVEX | DDS_CUBEMAP_NEGATIVEX |\
+ DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\
+ DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ )
+
+#define DDS_CUBEMAP 0x00000200 // DDSCAPS2_CUBEMAP
+
+#define DDS_FLAGS_VOLUME 0x00200000 // DDSCAPS2_VOLUME
+
+// Subset here matches D3D10_RESOURCE_DIMENSION and D3D11_RESOURCE_DIMENSION
+typedef enum DDS_RESOURCE_DIMENSION
+{
+ DDS_DIMENSION_TEXTURE1D = 2,
+ DDS_DIMENSION_TEXTURE2D = 3,
+ DDS_DIMENSION_TEXTURE3D = 4,
+} DDS_RESOURCE_DIMENSION;
+
+// Subset here matches D3D10_RESOURCE_MISC_FLAG and D3D11_RESOURCE_MISC_FLAG
+typedef enum DDS_RESOURCE_MISC_FLAG
+{
+ DDS_RESOURCE_MISC_TEXTURECUBE = 0x4L,
+} DDS_RESOURCE_MISC_FLAG;
+
+typedef struct
+{
+ uint32_t dwSize;
+ uint32_t dwFlags;
+ uint32_t dwHeight;
+ uint32_t dwWidth;
+ uint32_t dwPitchOrLinearSize;
+ uint32_t dwDepth; // only if DDS_HEADER_FLAGS_VOLUME is set in dwFlags
+ uint32_t dwMipMapCount;
+ uint32_t dwReserved1[11];
+ DDS_PIXELFORMAT ddspf;
+ uint32_t dwCaps;
+ uint32_t dwCaps2;
+ uint32_t dwCaps3;
+ uint32_t dwCaps4;
+ uint32_t dwReserved2;
+} DDS_HEADER;
+
+typedef struct
+{
+ DXGI_FORMAT dxgiFormat;
+ uint32_t resourceDimension;
+ uint32_t miscFlag; // see DDS_RESOURCE_MISC_FLAG
+ uint32_t arraySize;
+ uint32_t reserved;
+} DDS_HEADER_DXT10;
+
+#pragma pack(pop)
+
+}; // namespace
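The definitions above are sufficient to walk the start of a .dds file: the 4-byte magic value, the DDS_HEADER, and, when the pixel format carries the 'DX10' FourCC, the DDS_HEADER_DXT10 extension. A hedged parsing sketch follows; the helper name is hypothetical and the file is assumed to be fully in memory.

    // Hypothetical header walker built on the DDS.h definitions above.
    bool ParseDDSHeaders( const uint8_t* data, size_t size,
                          const DirectX::DDS_HEADER** header,
                          const DirectX::DDS_HEADER_DXT10** dx10 )
    {
        using namespace DirectX;
        *header = nullptr;
        *dx10 = nullptr;

        if ( size < sizeof(uint32_t) + sizeof(DDS_HEADER) )
            return false;

        // 4-byte magic "DDS "
        if ( *reinterpret_cast<const uint32_t*>(data) != DDS_MAGIC )
            return false;

        const DDS_HEADER* hdr = reinterpret_cast<const DDS_HEADER*>( data + sizeof(uint32_t) );
        if ( hdr->dwSize != sizeof(DDS_HEADER) || hdr->ddspf.dwSize != sizeof(DDS_PIXELFORMAT) )
            return false;

        // Optional DX10 extension header (the format is then in dxgiFormat)
        if ( (hdr->ddspf.dwFlags & DDS_FOURCC) && hdr->ddspf.dwFourCC == MAKEFOURCC('D','X','1','0') )
        {
            if ( size < sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10) )
                return false;
            *dx10 = reinterpret_cast<const DDS_HEADER_DXT10*>( data + sizeof(uint32_t) + sizeof(DDS_HEADER) );
        }

        *header = hdr;
        return true;
    }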
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTex.h b/thirdparty/directxtex/DirectXTex/DirectXTex.h
new file mode 100644
index 00000000..c4d4b73c
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTex.h
@@ -0,0 +1,466 @@
+//-------------------------------------------------------------------------------------
+// DirectXTex.h
+//
+// DirectX Texture Library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4005)
+#include <stdint.h>
+#pragma warning(pop)
+
+#include <algorithm>
+
+#include <dxgiformat.h>
+#include <d3d11.h>
+
+#define DIRECTX_TEX_VERSION 100
+
+namespace DirectX
+{
+ //---------------------------------------------------------------------------------
+ // DXGI Format Utilities
+ bool IsValid( _In_ DXGI_FORMAT fmt );
+ bool IsCompressed( _In_ DXGI_FORMAT fmt );
+ bool IsPacked( _In_ DXGI_FORMAT fmt );
+ bool IsVideo( _In_ DXGI_FORMAT fmt );
+ bool IsSRGB( _In_ DXGI_FORMAT fmt );
+ bool IsTypeless( _In_ DXGI_FORMAT fmt );
+
+ size_t BitsPerPixel( _In_ DXGI_FORMAT fmt );
+
+ enum CP_FLAGS
+ {
+ CP_FLAGS_NONE = 0x0, // Normal operation
+ CP_FLAGS_LEGACY_DWORD = 0x1, // Assume pitch is DWORD aligned instead of BYTE aligned
+ CP_FLAGS_24BPP = 0x10000, // Override with a legacy 24 bits-per-pixel format size
+ CP_FLAGS_16BPP = 0x20000, // Override with a legacy 16 bits-per-pixel format size
+ CP_FLAGS_8BPP = 0x40000, // Override with a legacy 8 bits-per-pixel format size
+ };
+
+ void ComputePitch( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height,
+ _Out_ size_t& rowPitch, _Out_ size_t& slicePitch, _In_ DWORD flags = CP_FLAGS_NONE );
+
+ size_t ComputeScanlines( _In_ DXGI_FORMAT fmt, _In_ size_t height );
+
+ DXGI_FORMAT MakeSRGB( _In_ DXGI_FORMAT fmt );
+ DXGI_FORMAT MakeTypeless( _In_ DXGI_FORMAT fmt );
+ DXGI_FORMAT MakeTypelessUNORM( _In_ DXGI_FORMAT fmt );
+ DXGI_FORMAT MakeTypelessFLOAT( _In_ DXGI_FORMAT fmt );
+
+ //---------------------------------------------------------------------------------
+ // Texture metadata
+ enum TEX_DIMENSION
+ // Subset here matches D3D10_RESOURCE_DIMENSION and D3D11_RESOURCE_DIMENSION
+ {
+ TEX_DIMENSION_TEXTURE1D = 2,
+ TEX_DIMENSION_TEXTURE2D = 3,
+ TEX_DIMENSION_TEXTURE3D = 4,
+ };
+
+ enum TEX_MISC_FLAG
+ // Subset here matches D3D10_RESOURCE_MISC_FLAG and D3D11_RESOURCE_MISC_FLAG
+ {
+ TEX_MISC_TEXTURECUBE = 0x4L,
+ };
+
+ struct TexMetadata
+ {
+ size_t width;
+ size_t height; // Should be 1 for 1D textures
+ size_t depth; // Should be 1 for 1D or 2D textures
+ size_t arraySize; // For cubemap, this is a multiple of 6
+ size_t mipLevels;
+ uint32_t miscFlags;
+ DXGI_FORMAT format;
+ TEX_DIMENSION dimension;
+
+ size_t ComputeIndex( _In_ size_t mip, _In_ size_t item, _In_ size_t slice ) const;
+ // Returns size_t(-1) to indicate an out-of-range error
+ };
+
+ enum DDS_FLAGS
+ {
+ DDS_FLAGS_NONE = 0x0,
+
+ DDS_FLAGS_LEGACY_DWORD = 0x1,
+ // Assume pitch is DWORD aligned instead of BYTE aligned (used by some legacy DDS files)
+
+ DDS_FLAGS_NO_LEGACY_EXPANSION = 0x2,
+ // Do not implicitly convert legacy formats that result in larger pixel sizes (24 bpp, 3:3:2, A8L8, A4L4, P8, A8P8)
+
+ DDS_FLAGS_NO_R10B10G10A2_FIXUP = 0x4,
+ // Do not use work-around for long-standing D3DX DDS file format issue which reversed the 10:10:10:2 color order masks
+
+ DDS_FLAGS_FORCE_RGB = 0x8,
+ // Convert DXGI 1.1 BGR formats to DXGI_FORMAT_R8G8B8A8_UNORM to avoid use of optional WDDM 1.1 formats
+
+ DDS_FLAGS_NO_16BPP = 0x10,
+ // Conversions avoid use of 565, 5551, and 4444 formats and instead expand to 8888 to avoid use of optional WDDM 1.2 formats
+
+ DDS_FLAGS_FORCE_DX10_EXT = 0x10000,
+ // Always use the 'DX10' header extension for DDS writer (i.e. don't try to write DX9 compatible DDS files)
+ };
+
+ enum WIC_FLAGS
+ {
+ WIC_FLAGS_NONE = 0x0,
+
+ WIC_FLAGS_FORCE_RGB = 0x1,
+ // Loads DXGI 1.1 BGR formats as DXGI_FORMAT_R8G8B8A8_UNORM to avoid use of optional WDDM 1.1 formats
+
+ WIC_FLAGS_NO_X2_BIAS = 0x2,
+ // Loads DXGI 1.1 X2 10:10:10:2 format as DXGI_FORMAT_R10G10B10A2_UNORM
+
+ WIC_FLAGS_NO_16BPP = 0x4,
+ // Loads 565, 5551, and 4444 formats as 8888 to avoid use of optional WDDM 1.2 formats
+
+ WIC_FLAGS_ALLOW_MONO = 0x8,
+ // Loads 1-bit monochrome (black & white) as R1_UNORM rather than 8-bit greyscale
+
+ WIC_FLAGS_ALL_FRAMES = 0x10,
+ // Loads all images in a multi-frame file, converting/resizing to match the first frame as needed, defaults to 0th frame otherwise
+
+ WIC_FLAGS_DITHER = 0x10000,
+ // Use ordered 4x4 dithering for any required conversions
+
+ WIC_FLAGS_DITHER_DIFFUSION = 0x20000,
+ // Use error-diffusion dithering for any required conversions
+
+ WIC_FLAGS_FILTER_POINT = 0x100000,
+ WIC_FLAGS_FILTER_LINEAR = 0x200000,
+ WIC_FLAGS_FILTER_CUBIC = 0x300000,
+ WIC_FLAGS_FILTER_FANT = 0x400000, // Combination of Linear and Box filter
+ // Filtering mode to use for any required image resizing (only needed when loading arrays of differently sized images; defaults to Fant)
+ };
+
+ HRESULT GetMetadataFromDDSMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags,
+ _Out_ TexMetadata& metadata );
+ HRESULT GetMetadataFromDDSFile( _In_z_ LPCWSTR szFile, DWORD flags,
+ _Out_ TexMetadata& metadata );
+
+ HRESULT GetMetadataFromTGAMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size,
+ _Out_ TexMetadata& metadata );
+ HRESULT GetMetadataFromTGAFile( _In_z_ LPCWSTR szFile,
+ _Out_ TexMetadata& metadata );
+
+ HRESULT GetMetadataFromWICMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags,
+ _Out_ TexMetadata& metadata );
+ HRESULT GetMetadataFromWICFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags,
+ _Out_ TexMetadata& metadata );
+
+ //---------------------------------------------------------------------------------
+ // Bitmap image container
+ struct Image
+ {
+ size_t width;
+ size_t height;
+ DXGI_FORMAT format;
+ size_t rowPitch;
+ size_t slicePitch;
+ uint8_t* pixels;
+ };
+
+ class ScratchImage
+ {
+ public:
+ ScratchImage() : _nimages(0), _size(0), _image(0), _memory(0) {}
+ ~ScratchImage() { Release(); }
+
+ HRESULT Initialize( _In_ const TexMetadata& mdata );
+
+ HRESULT Initialize1D( _In_ DXGI_FORMAT fmt, _In_ size_t length, _In_ size_t arraySize, _In_ size_t mipLevels );
+ HRESULT Initialize2D( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t arraySize, _In_ size_t mipLevels );
+ HRESULT Initialize3D( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t depth, _In_ size_t mipLevels );
+ HRESULT InitializeCube( _In_ DXGI_FORMAT fmt, _In_ size_t width, _In_ size_t height, _In_ size_t nCubes, _In_ size_t mipLevels );
+
+ HRESULT InitializeFromImage( _In_ const Image& srcImage, _In_ bool allow1D = false );
+ HRESULT InitializeArrayFromImages( _In_count_(nImages) const Image* images, _In_ size_t nImages, _In_ bool allow1D = false );
+ HRESULT InitializeCubeFromImages( _In_count_(nImages) const Image* images, _In_ size_t nImages );
+ HRESULT Initialize3DFromImages( _In_count_(depth) const Image* images, _In_ size_t depth );
+
+ void Release();
+
+ bool OverrideFormat( _In_ DXGI_FORMAT f );
+
+ const TexMetadata& GetMetadata() const { return _metadata; }
+ const Image* GetImage(_In_ size_t mip, _In_ size_t item, _In_ size_t slice) const;
+
+ const Image* GetImages() const { return _image; }
+ size_t GetImageCount() const { return _nimages; }
+
+ uint8_t* GetPixels() const { return _memory; }
+ size_t GetPixelsSize() const { return _size; }
+
+ private:
+ size_t _nimages;
+ size_t _size;
+ TexMetadata _metadata;
+ Image* _image;
+ uint8_t* _memory;
+
+ // Hide copy constructor and assignment operator
+ ScratchImage( const ScratchImage& );
+ ScratchImage& operator=( const ScratchImage& );
+ };
+
+ //---------------------------------------------------------------------------------
+ // Memory blob (allocated buffer pointer is always 16-byte aligned)
+ class Blob
+ {
+ public:
+ Blob() : _buffer(0), _size(0) {}
+ ~Blob() { Release(); }
+
+ HRESULT Initialize( _In_ size_t size );
+
+ void Release();
+
+ void *GetBufferPointer() const { return _buffer; }
+ size_t GetBufferSize() const { return _size; }
+
+ private:
+ void* _buffer;
+ size_t _size;
+
+ // Hide copy constructor and assignment operator
+ Blob( const Blob& );
+ Blob& operator=( const Blob& );
+ };
+
+ //---------------------------------------------------------------------------------
+ // Image I/O
+
+ // DDS operations
+ HRESULT LoadFromDDSMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+ HRESULT LoadFromDDSFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+
+ HRESULT SaveToDDSMemory( _In_ const Image& image, _In_ DWORD flags,
+ _Out_ Blob& blob );
+ HRESULT SaveToDDSMemory( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ const TexMetadata& metadata, _In_ DWORD flags,
+ _Out_ Blob& blob );
+
+ HRESULT SaveToDDSFile( _In_ const Image& image, _In_ DWORD flags, _In_z_ LPCWSTR szFile );
+ HRESULT SaveToDDSFile( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ const TexMetadata& metadata, _In_ DWORD flags, _In_z_ LPCWSTR szFile );
+
+ // TGA operations
+ HRESULT LoadFromTGAMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+ HRESULT LoadFromTGAFile( _In_z_ LPCWSTR szFile,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+
+ HRESULT SaveToTGAMemory( _In_ const Image& image, _Out_ Blob& blob );
+ HRESULT SaveToTGAFile( _In_ const Image& image, _In_z_ LPCWSTR szFile );
+
+ // WIC operations
+ HRESULT LoadFromWICMemory( _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DWORD flags,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+ HRESULT LoadFromWICFile( _In_z_ LPCWSTR szFile, _In_ DWORD flags,
+ _Out_opt_ TexMetadata* metadata, _Out_ ScratchImage& image );
+
+ HRESULT SaveToWICMemory( _In_ const Image& image, _In_ DWORD flags, _In_ REFGUID guidContainerFormat,
+ _Out_ Blob& blob, _In_opt_ const GUID* targetFormat = nullptr );
+ HRESULT SaveToWICMemory( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags, _In_ REFGUID guidContainerFormat,
+ _Out_ Blob& blob, _In_opt_ const GUID* targetFormat = nullptr );
+
+ HRESULT SaveToWICFile( _In_ const Image& image, _In_ DWORD flags, _In_ REFGUID guidContainerFormat,
+ _In_z_ LPCWSTR szFile, _In_opt_ const GUID* targetFormat = nullptr );
+ HRESULT SaveToWICFile( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags, _In_ REFGUID guidContainerFormat,
+ _In_z_ LPCWSTR szFile, _In_opt_ const GUID* targetFormat = nullptr );
+
+ enum WICCodecs
+ {
+ WIC_CODEC_BMP =1, // Windows Bitmap (.bmp)
+ WIC_CODEC_JPEG, // Joint Photographic Experts Group (.jpg, .jpeg)
+ WIC_CODEC_PNG, // Portable Network Graphics (.png)
+ WIC_CODEC_TIFF, // Tagged Image File Format (.tif, .tiff)
+ WIC_CODEC_GIF, // Graphics Interchange Format (.gif)
+ WIC_CODEC_WMP, // Windows Media Photo / HD Photo / JPEG XR (.hdp, .jxr, .wdp)
+ WIC_CODEC_ICO, // Windows Icon (.ico)
+ };
+
+ REFGUID GetWICCodec( _In_ WICCodecs codec );
+
+ //---------------------------------------------------------------------------------
+ // Texture conversion, resizing, mipmap generation, and block compression
+
+ enum TEX_FR_FLAGS
+ {
+ TEX_FR_ROTATE0 = 0x0,
+ TEX_FR_ROTATE90 = 0x1,
+ TEX_FR_ROTATE180 = 0x2,
+ TEX_FR_ROTATE270 = 0x3,
+ TEX_FR_FLIP_HORIZONTAL = 0x08,
+ TEX_FR_FLIP_VERTICAL = 0x10,
+ };
+
+ HRESULT FlipRotate( _In_ const Image& srcImage, _In_ DWORD flags, _Out_ ScratchImage& image );
+ HRESULT FlipRotate( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DWORD flags, _Out_ ScratchImage& result );
+ // Flip and/or rotate image
+
+ enum TEX_FILTER_FLAGS
+ {
+ TEX_FILTER_DEFAULT = 0,
+
+ // Clamp filtering only
+
+ TEX_FILTER_SEPARATE_ALPHA = 0x100,
+ // Resize color and alpha channel independently
+
+ TEX_FILTER_DITHER = 0x10000,
+ // Use ordered 4x4 dithering for any required conversions
+ TEX_FILTER_DITHER_DIFFUSION = 0x20000,
+ // Use error-diffusion dithering for any required conversions
+
+ TEX_FILTER_POINT = 0x100000,
+ TEX_FILTER_LINEAR = 0x200000,
+ TEX_FILTER_CUBIC = 0x300000,
+ TEX_FILTER_FANT = 0x400000, // Equiv to Box filtering for mipmap generation
+ // Filtering mode to use for any required image resizing
+
+ TEX_FILTER_SRGB_IN = 0x1000000,
+ TEX_FILTER_SRGB_OUT = 0x2000000,
+ TEX_FILTER_SRGB = 0x3000000,
+ // sRGB <-> RGB for use in conversion operations
+ // if the input format type is IsSRGB(), then SRGB_IN is on by default
+ // if the output format type is IsSRGB(), then SRGB_OUT is on by default
+ };
+
+ HRESULT Resize( _In_ const Image& srcImage, _In_ size_t width, _In_ size_t height, _In_ DWORD filter,
+ _Out_ ScratchImage& image );
+ HRESULT Resize( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ size_t width, _In_ size_t height, _In_ DWORD filter, _Out_ ScratchImage& result );
+ // Resize the image to width x height. Defaults to Fant filtering.
+ // Note for a complex resize, the result will always have mipLevels == 1
+
+ HRESULT Convert( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _In_ DWORD filter, _In_ float threshold,
+ _Out_ ScratchImage& image );
+ HRESULT Convert( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DXGI_FORMAT format, _In_ DWORD filter, _In_ float threshold, _Out_ ScratchImage& result );
+ // Convert the image to a new format
+
+ HRESULT GenerateMipMaps( _In_ const Image& baseImage, _In_ DWORD filter, _In_ size_t levels,
+ _Out_ ScratchImage& mipChain, bool allow1D = false );
+ HRESULT GenerateMipMaps( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DWORD filter, _In_ size_t levels, _Out_ ScratchImage& mipChain );
+ // levels of '0' indicates a full mipchain, otherwise it generates that number of total levels (including the source base image)
+ // Defaults to Fant filtering which is equivalent to a box filter
+
+ HRESULT GenerateMipMaps3D( _In_count_(depth) const Image* baseImages, _In_ size_t depth, _In_ DWORD filter, _In_ size_t levels,
+ _Out_ ScratchImage& mipChain );
+ HRESULT GenerateMipMaps3D( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DWORD filter, _In_ size_t levels, _Out_ ScratchImage& mipChain );
+ // levels of '0' indicates a full mipchain, otherwise it generates that number of total levels (including the source base image)
+ // Defaults to Fant filtering which is equivalent to a box filter
+
+ enum TEX_COMPRESS_FLAGS
+ {
+ TEX_COMPRESS_DEFAULT = 0,
+
+ TEX_COMPRESS_RGB_DITHER = 0x10000,
+ // Enables dithering RGB colors for BC1-3 compression
+
+ TEX_COMPRESS_A_DITHER = 0x20000,
+ // Enables dithering alpha for BC1-3 compression
+
+ TEX_COMPRESS_DITHER = 0x30000,
+ // Enables both RGB and alpha dithering for BC1-3 compression
+
+ TEX_COMPRESS_UNIFORM = 0x40000,
+ // Uniform color weighting for BC1-3 compression; by default uses perceptual weighting
+
+ TEX_COMPRESS_PARALLEL = 0x10000000,
+ // Compress is free to use multithreading to improve performance (by default it does not use multithreading)
+ };
+
+ HRESULT Compress( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _In_ DWORD compress, _In_ float alphaRef,
+ _Out_ ScratchImage& cImage );
+ HRESULT Compress( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DXGI_FORMAT format, _In_ DWORD compress, _In_ float alphaRef, _Out_ ScratchImage& cImages );
+ // Note that alphaRef is only used by BC1. 0.5f is a typical value to use
+
+ HRESULT Decompress( _In_ const Image& cImage, _In_ DXGI_FORMAT format, _Out_ ScratchImage& image );
+ HRESULT Decompress( _In_count_(nimages) const Image* cImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DXGI_FORMAT format, _Out_ ScratchImage& images );
+
+ //---------------------------------------------------------------------------------
+ // Normal map operations
+
+ enum CNMAP_FLAGS
+ {
+ CNMAP_DEFAULT = 0,
+
+ CNMAP_CHANNEL_RED = 0x1,
+ CNMAP_CHANNEL_GREEN = 0x2,
+ CNMAP_CHANNEL_BLUE = 0x3,
+ CNMAP_CHANNEL_ALPHA = 0x4,
+ CNMAP_CHANNEL_LUMINANCE = 0x5,
+ // Channel selection when evaluating the color value for height
+ // Luminance is a combination of red, green, and blue
+
+ CNMAP_MIRROR_U = 0x1000,
+ CNMAP_MIRROR_V = 0x2000,
+ CNMAP_MIRROR = 0x3000,
+ // Use mirror semantics for scanline references (defaults to wrap)
+
+ CNMAP_INVERT_SIGN = 0x4000,
+ // Inverts normal sign
+
+ CNMAP_COMPUTE_OCCLUSION = 0x8000,
+ // Computes a crude occlusion term stored in the alpha channel
+ };
+
+ HRESULT ComputeNormalMap( _In_ const Image& srcImage, _In_ DWORD flags, _In_ float amplitude,
+ _In_ DXGI_FORMAT format, _Out_ ScratchImage& normalMap );
+ HRESULT ComputeNormalMap( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DWORD flags, _In_ float amplitude, _In_ DXGI_FORMAT format, _Out_ ScratchImage& normalMaps );
+
+ //---------------------------------------------------------------------------------
+ // Misc image operations
+ struct Rect
+ {
+ size_t x;
+ size_t y;
+ size_t w;
+ size_t h;
+
+ Rect() {}
+ Rect( size_t _x, size_t _y, size_t _w, size_t _h ) : x(_x), y(_y), w(_w), h(_h) {}
+ };
+
+ HRESULT CopyRectangle( _In_ const Image& srcImage, _In_ const Rect& srcRect, _In_ const Image& dstImage,
+ _In_ DWORD filter, _In_ size_t xOffset, _In_ size_t yOffset );
+
+ HRESULT ComputeMSE( _In_ const Image& image1, _In_ const Image& image2, _Out_ float& mse, _Out_opt_cap_c_(4) float* mseV );
+
+ //---------------------------------------------------------------------------------
+ // Direct3D 11 functions
+ bool IsSupportedTexture( _In_ ID3D11Device* pDevice, _In_ const TexMetadata& metadata );
+
+ HRESULT CreateTexture( _In_ ID3D11Device* pDevice, _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _Deref_out_ ID3D11Resource** ppResource );
+
+ HRESULT CreateShaderResourceView( _In_ ID3D11Device* pDevice, _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _Deref_out_ ID3D11ShaderResourceView** ppSRV );
+
+ HRESULT CaptureTexture( _In_ ID3D11Device* pDevice, _In_ ID3D11DeviceContext* pContext, _In_ ID3D11Resource* pSource, _Out_ ScratchImage& result );
+
+#include "DirectXTex.inl"
+
+}; // namespace
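A short usage sketch of the public API declared above (the file names are assumptions and error handling is abbreviated): load a DDS file, block-compress it to BC7, and write the result back out.

    #include "DirectXTex.h"
    using namespace DirectX;

    HRESULT CompressDDSToBC7()   // hypothetical helper
    {
        ScratchImage source;
        HRESULT hr = LoadFromDDSFile( L"input.dds", DDS_FLAGS_NONE, nullptr, source );
        if ( FAILED(hr) )
            return hr;

        ScratchImage compressed;
        hr = Compress( source.GetImages(), source.GetImageCount(), source.GetMetadata(),
                       DXGI_FORMAT_BC7_UNORM, TEX_COMPRESS_DEFAULT, 0.5f, compressed );
        if ( FAILED(hr) )
            return hr;

        // alphaRef (0.5f above) only matters for BC1; it is ignored for BC7.
        return SaveToDDSFile( compressed.GetImages(), compressed.GetImageCount(), compressed.GetMetadata(),
                              DDS_FLAGS_NONE, L"output.dds" );
    }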
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTex.inl b/thirdparty/directxtex/DirectXTex/DirectXTex.inl
new file mode 100644
index 00000000..909cd402
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTex.inl
@@ -0,0 +1,223 @@
+//-------------------------------------------------------------------------------------
+// DirectXTex.inl
+//
+// DirectX Texture Library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+//=====================================================================================
+// DXGI Format Utilities
+//=====================================================================================
+
+inline bool IsValid( DXGI_FORMAT fmt )
+{
+#ifdef DXGI_1_2_FORMATS
+ return ( static_cast<size_t>(fmt) >= 1 && static_cast<size_t>(fmt) <= 115 );
+#else
+ return ( static_cast<size_t>(fmt) >= 1 && static_cast<size_t>(fmt) <= 99 );
+#endif
+}
+
+inline bool IsCompressed( DXGI_FORMAT fmt )
+{
+ switch ( fmt )
+ {
+ case DXGI_FORMAT_BC1_TYPELESS:
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ case DXGI_FORMAT_BC2_TYPELESS:
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ case DXGI_FORMAT_BC3_TYPELESS:
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC4_UNORM:
+ case DXGI_FORMAT_BC4_SNORM:
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_BC5_UNORM:
+ case DXGI_FORMAT_BC5_SNORM:
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+inline bool IsPacked( DXGI_FORMAT fmt )
+{
+ return ( (fmt == DXGI_FORMAT_R8G8_B8G8_UNORM) || (fmt == DXGI_FORMAT_G8R8_G8B8_UNORM) );
+}
+
+inline bool IsVideo( DXGI_FORMAT fmt )
+{
+#ifdef DXGI_1_2_FORMATS
+ switch ( fmt )
+ {
+ case DXGI_FORMAT_AYUV:
+ case DXGI_FORMAT_Y410:
+ case DXGI_FORMAT_Y416:
+ case DXGI_FORMAT_NV12:
+ case DXGI_FORMAT_P010:
+ case DXGI_FORMAT_P016:
+ case DXGI_FORMAT_YUY2:
+ case DXGI_FORMAT_Y210:
+ case DXGI_FORMAT_Y216:
+ case DXGI_FORMAT_NV11:
+ // These video formats can be used with the 3D pipeline through special view mappings
+ return true;
+
+ case DXGI_FORMAT_420_OPAQUE:
+ case DXGI_FORMAT_AI44:
+ case DXGI_FORMAT_IA44:
+ case DXGI_FORMAT_P8:
+ case DXGI_FORMAT_A8P8:
+ // These are limited use video formats not usable in any way by the 3D pipeline
+ return true;
+
+ default:
+ return false;
+ }
+#else // !DXGI_1_2_FORMATS
+ UNREFERENCED_PARAMETER(fmt);
+ return false;
+#endif
+}
+
+inline bool IsSRGB( DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+inline bool IsTypeless( DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R32G32B32A32_TYPELESS:
+ case DXGI_FORMAT_R32G32B32_TYPELESS:
+ case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+ case DXGI_FORMAT_R32G32_TYPELESS:
+ case DXGI_FORMAT_R32G8X24_TYPELESS:
+ case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS:
+ case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT:
+ case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+ case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+ case DXGI_FORMAT_R16G16_TYPELESS:
+ case DXGI_FORMAT_R32_TYPELESS:
+ case DXGI_FORMAT_R24G8_TYPELESS:
+ case DXGI_FORMAT_R24_UNORM_X8_TYPELESS:
+ case DXGI_FORMAT_X24_TYPELESS_G8_UINT:
+ case DXGI_FORMAT_R8G8_TYPELESS:
+ case DXGI_FORMAT_R16_TYPELESS:
+ case DXGI_FORMAT_R8_TYPELESS:
+ case DXGI_FORMAT_BC1_TYPELESS:
+ case DXGI_FORMAT_BC2_TYPELESS:
+ case DXGI_FORMAT_BC3_TYPELESS:
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+inline size_t ComputeScanlines( _In_ DXGI_FORMAT fmt, _In_ size_t height )
+{
+ switch ( fmt )
+ {
+ case DXGI_FORMAT_BC1_TYPELESS:
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ case DXGI_FORMAT_BC2_TYPELESS:
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ case DXGI_FORMAT_BC3_TYPELESS:
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC4_UNORM:
+ case DXGI_FORMAT_BC4_SNORM:
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_BC5_UNORM:
+ case DXGI_FORMAT_BC5_SNORM:
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return std::max<size_t>( 1, (height + 3) / 4 );
+
+ default:
+ return height;
+ }
+}
+
+//=====================================================================================
+// Image I/O
+//=====================================================================================
+inline HRESULT SaveToDDSMemory( const Image& image, DWORD flags, Blob& blob )
+{
+ TexMetadata mdata;
+ memset( &mdata, 0, sizeof(mdata) );
+ mdata.width = image.width;
+ mdata.height = image.height;
+ mdata.depth = 1;
+ mdata.arraySize = 1;
+ mdata.mipLevels = 1;
+ mdata.format = image.format;
+ mdata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ return SaveToDDSMemory( &image, 1, mdata, flags, blob );
+}
+
+inline HRESULT SaveToDDSFile( const Image& image, DWORD flags, LPCWSTR szFile )
+{
+ TexMetadata mdata;
+ memset( &mdata, 0, sizeof(mdata) );
+ mdata.width = image.width;
+ mdata.height = image.height;
+ mdata.depth = 1;
+ mdata.arraySize = 1;
+ mdata.mipLevels = 1;
+ mdata.format = image.format;
+ mdata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ return SaveToDDSFile( &image, 1, mdata, flags, szFile );
+}
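For the block-compressed formats, ComputeScanlines above rounds the height up to whole rows of 4x4 blocks, while uncompressed formats simply return the height. For example (hypothetical values, assuming the declarations above are in scope):

    ComputeScanlines( DXGI_FORMAT_BC1_UNORM, 10 );      // (10 + 3) / 4 = 3 rows of blocks
    ComputeScanlines( DXGI_FORMAT_R8G8B8A8_UNORM, 10 ); // 10 scanlines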
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp
new file mode 100644
index 00000000..9c1bfbd8
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexCompress.cpp
@@ -0,0 +1,697 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexCompress.cpp
+//
+// DirectX Texture Library - Texture compression
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#pragma warning(disable : 4616 6001 6993)
+#endif
+
+#include "BC.h"
+
+namespace DirectX
+{
+
+inline static DWORD _GetBCFlags( _In_ DWORD compress )
+{
+ static_assert( TEX_COMPRESS_RGB_DITHER == BC_FLAGS_DITHER_RGB, "TEX_COMPRESS_* flags should match BC_FLAGS_*" );
+ static_assert( TEX_COMPRESS_A_DITHER == BC_FLAGS_DITHER_A, "TEX_COMPRESS_* flags should match BC_FLAGS_*" );
+ static_assert( TEX_COMPRESS_DITHER == (BC_FLAGS_DITHER_RGB | BC_FLAGS_DITHER_A), "TEX_COMPRESS_* flags should match BC_FLAGS_*" );
+ static_assert( TEX_COMPRESS_UNIFORM == BC_FLAGS_UNIFORM, "TEX_COMPRESS_* flags should match BC_FLAGS_*" );
+ return ( compress & (BC_FLAGS_DITHER_RGB|BC_FLAGS_DITHER_A|BC_FLAGS_UNIFORM) );
+}
+
+
+//-------------------------------------------------------------------------------------
+static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
+ _In_ float alphaRef, _In_ bool degenerate )
+{
+ if ( !image.pixels || !result.pixels )
+ return E_POINTER;
+
+ assert( image.width == result.width );
+ assert( image.height == result.height );
+
+ const DXGI_FORMAT format = image.format;
+ size_t sbpp = BitsPerPixel( format );
+ if ( !sbpp )
+ return E_FAIL;
+
+ if ( sbpp < 8 )
+ {
+ // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Round to bytes
+ sbpp = ( sbpp + 7 ) / 8;
+
+ uint8_t *pDest = result.pixels;
+
+ // Determine BC format encoder
+ BC_ENCODE pfEncode;
+ size_t blocksize;
+ switch(result.format)
+ {
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; break;
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; break;
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; break;
+ case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; break;
+ case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; break;
+ case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; break;
+ case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; break;
+ case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; break;
+ case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; break;
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; break;
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ XMVECTOR temp[16];
+ const uint8_t *pSrc = image.pixels;
+ const size_t rowPitch = image.rowPitch;
+ for( size_t h=0; h < image.height; h += 4 )
+ {
+ const uint8_t *sptr = pSrc;
+ uint8_t* dptr = pDest;
+ for( size_t count = 0; count < rowPitch; count += sbpp*4 )
+ {
+ if ( !_LoadScanline( &temp[0], 4, sptr, rowPitch, format ) )
+ return E_FAIL;
+
+ if ( image.height > 1 )
+ {
+ if ( !_LoadScanline( &temp[4], 4, sptr + rowPitch, rowPitch, format ) )
+ return E_FAIL;
+
+ if ( image.height > 2 )
+ {
+ if ( !_LoadScanline( &temp[8], 4, sptr + rowPitch*2, rowPitch, format ) )
+ return E_FAIL;
+
+ if ( !_LoadScanline( &temp[12], 4, sptr + rowPitch*3, rowPitch, format ) )
+ return E_FAIL;
+ }
+ }
+
+ if ( degenerate )
+ {
+ assert( image.width < 4 || image.height < 4 );
+ const size_t uSrc[] = { 0, 0, 0, 1 };
+
+ if ( image.width < 4 )
+ {
+ for( size_t t=0; t < image.height && t < 4; ++t )
+ {
+ for( size_t s = image.width; s < 4; ++s )
+ {
+ temp[ t*4 + s ] = temp[ t*4 + uSrc[s] ];
+ }
+ }
+ }
+
+ if ( image.height < 4 )
+ {
+ for( size_t t=image.height; t < 4; ++t )
+ {
+ for( size_t s =0; s < 4; ++s )
+ {
+ temp[ t*4 + s ] = temp[ uSrc[t]*4 + s ];
+ }
+ }
+ }
+ }
+
+ _ConvertScanline( temp, 16, result.format, format, 0 );
+
+ if ( pfEncode )
+ pfEncode( dptr, temp, bcflags );
+ else
+ D3DXEncodeBC1( dptr, temp, alphaRef, bcflags );
+
+ sptr += sbpp*4;
+ dptr += blocksize;
+ }
+
+ pSrc += rowPitch*4;
+ pDest += result.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+#ifdef _OPENMP
+static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
+ _In_ float alphaRef )
+{
+ if ( !image.pixels || !result.pixels )
+ return E_POINTER;
+
+ // Parallel version doesn't support degenerate case
+ assert( ((image.width % 4) == 0) && ((image.height % 4) == 0 ) );
+
+ assert( image.width == result.width );
+ assert( image.height == result.height );
+
+ const DXGI_FORMAT format = image.format;
+ size_t sbpp = BitsPerPixel( format );
+ if ( !sbpp )
+ return E_FAIL;
+
+ if ( sbpp < 8 )
+ {
+ // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Round to bytes
+ sbpp = ( sbpp + 7 ) / 8;
+
+ // Determine BC format encoder
+ BC_ENCODE pfEncode;
+ size_t blocksize;
+ switch(result.format)
+ {
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; break;
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; break;
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; break;
+ case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; break;
+ case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; break;
+ case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; break;
+ case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; break;
+ case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; break;
+ case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; break;
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; break;
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Refactored version of loop to support parallel independence
+ const size_t nBlocks = std::max<size_t>(1, image.width / 4) * std::max<size_t>(1, image.height / 4);
+
+ bool fail = false;
+
+#pragma omp parallel for
+ for( int nb=0; nb < static_cast<int>( nBlocks ); ++nb )
+ {
+ const size_t nbWidth = std::max<size_t>(1, image.width / 4);
+
+ const size_t y = nb / nbWidth;
+ const size_t x = nb - (y*nbWidth);
+
+ assert( x < image.width && y < image.height );
+
+ size_t rowPitch = image.rowPitch;
+ const uint8_t *pSrc = image.pixels + (y*4*rowPitch) + (x*4*sbpp);
+
+ uint8_t *pDest = result.pixels + (nb*blocksize);
+
+ XMVECTOR temp[16];
+ if ( !_LoadScanline( &temp[0], 4, pSrc, rowPitch, format ) )
+ fail = true;
+
+ if ( !_LoadScanline( &temp[4], 4, pSrc + rowPitch, rowPitch, format ) )
+ fail = true;
+
+ if ( !_LoadScanline( &temp[8], 4, pSrc + rowPitch*2, rowPitch, format ) )
+ fail = true;
+
+ if ( !_LoadScanline( &temp[12], 4, pSrc + rowPitch*3, rowPitch, format ) )
+ fail = true;
+
+ _ConvertScanline( temp, 16, result.format, format, 0 );
+
+ if ( pfEncode )
+ pfEncode( pDest, temp, bcflags );
+ else
+ D3DXEncodeBC1( pDest, temp, alphaRef, bcflags );
+ }
+
+ return (fail) ? E_FAIL : S_OK;
+}
+
+#endif // _OPENMP
+
+
+//-------------------------------------------------------------------------------------
+static DXGI_FORMAT _DefaultDecompress( _In_ DXGI_FORMAT format )
+{
+ switch( format )
+ {
+ case DXGI_FORMAT_BC1_TYPELESS:
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC2_TYPELESS:
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC3_TYPELESS:
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ case DXGI_FORMAT_BC7_UNORM:
+ return DXGI_FORMAT_R8G8B8A8_UNORM;
+
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return DXGI_FORMAT_R8G8B8A8_UNORM_SRGB;
+
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC4_UNORM:
+ return DXGI_FORMAT_R8_UNORM;
+
+ case DXGI_FORMAT_BC4_SNORM:
+ return DXGI_FORMAT_R8_SNORM;
+
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_BC5_UNORM:
+ return DXGI_FORMAT_R8G8_UNORM;
+
+ case DXGI_FORMAT_BC5_SNORM:
+ return DXGI_FORMAT_R8G8_SNORM;
+
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ // We could use DXGI_FORMAT_R32G32B32_FLOAT here since BC6H is always Alpha 1.0,
+ // but this format is better supported by viewers
+ return DXGI_FORMAT_R32G32B32A32_FLOAT;
+
+ default:
+ return DXGI_FORMAT_UNKNOWN;
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+static HRESULT _DecompressBC( _In_ const Image& cImage, _In_ const Image& result )
+{
+ if ( !cImage.pixels || !result.pixels )
+ return E_POINTER;
+
+ assert( cImage.width == result.width );
+ assert( cImage.height == result.height );
+
+ // Image must be a multiple of 4 (degenerate cases of 1x1, 1x2, 2x1, and 2x2 are allowed)
+ size_t width = cImage.width;
+ if ( (width % 4) != 0 )
+ {
+ if ( width != 1 && width != 2 )
+ return E_INVALIDARG;
+ }
+
+ size_t height = cImage.height;
+ if ( (height % 4) != 0 )
+ {
+ if ( height != 1 && height != 2 )
+ return E_INVALIDARG;
+ }
+
+ const DXGI_FORMAT format = result.format;
+ size_t dbpp = BitsPerPixel( format );
+ if ( !dbpp )
+ return E_FAIL;
+
+ if ( dbpp < 8 )
+ {
+ // We don't support decompressing to monochrome (DXGI_FORMAT_R1_UNORM)
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Round to bytes
+ dbpp = ( dbpp + 7 ) / 8;
+
+ uint8_t *pDest = result.pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ // Promote "typeless" BC formats
+ DXGI_FORMAT cformat;
+ switch( cImage.format )
+ {
+ case DXGI_FORMAT_BC1_TYPELESS: cformat = DXGI_FORMAT_BC1_UNORM; break;
+ case DXGI_FORMAT_BC2_TYPELESS: cformat = DXGI_FORMAT_BC2_UNORM; break;
+ case DXGI_FORMAT_BC3_TYPELESS: cformat = DXGI_FORMAT_BC3_UNORM; break;
+ case DXGI_FORMAT_BC4_TYPELESS: cformat = DXGI_FORMAT_BC4_UNORM; break;
+ case DXGI_FORMAT_BC5_TYPELESS: cformat = DXGI_FORMAT_BC5_UNORM; break;
+ case DXGI_FORMAT_BC6H_TYPELESS: cformat = DXGI_FORMAT_BC6H_UF16; break;
+ case DXGI_FORMAT_BC7_TYPELESS: cformat = DXGI_FORMAT_BC7_UNORM; break;
+ default: cformat = cImage.format; break;
+ }
+
+ // Determine BC format decoder
+ BC_DECODE pfDecode;
+ size_t sbpp;
+ switch(cformat)
+ {
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB: pfDecode = D3DXDecodeBC1; sbpp = 8; break;
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB: pfDecode = D3DXDecodeBC2; sbpp = 16; break;
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB: pfDecode = D3DXDecodeBC3; sbpp = 16; break;
+ case DXGI_FORMAT_BC4_UNORM: pfDecode = D3DXDecodeBC4U; sbpp = 8; break;
+ case DXGI_FORMAT_BC4_SNORM: pfDecode = D3DXDecodeBC4S; sbpp = 8; break;
+ case DXGI_FORMAT_BC5_UNORM: pfDecode = D3DXDecodeBC5U; sbpp = 16; break;
+ case DXGI_FORMAT_BC5_SNORM: pfDecode = D3DXDecodeBC5S; sbpp = 16; break;
+ case DXGI_FORMAT_BC6H_UF16: pfDecode = D3DXDecodeBC6HU; sbpp = 16; break;
+ case DXGI_FORMAT_BC6H_SF16: pfDecode = D3DXDecodeBC6HS; sbpp = 16; break;
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB: pfDecode = D3DXDecodeBC7; sbpp = 16; break;
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ XMVECTOR temp[16];
+ const uint8_t *pSrc = cImage.pixels;
+ const size_t rowPitch = result.rowPitch;
+ for( size_t h=0; h < cImage.height; h += 4 )
+ {
+ const uint8_t *sptr = pSrc;
+ uint8_t* dptr = pDest;
+ for( size_t count = 0; count < cImage.rowPitch; count += sbpp )
+ {
+ pfDecode( temp, sptr );
+ _ConvertScanline( temp, 16, format, cformat, 0 );
+
+ if ( !_StoreScanline( dptr, rowPitch, format, &temp[0], 4 ) )
+ return E_FAIL;
+
+ if ( result.height > 1 )
+ {
+ if ( !_StoreScanline( dptr + rowPitch, rowPitch, format, &temp[4], 4 ) )
+ return E_FAIL;
+
+ if ( result.height > 2 )
+ {
+ if ( !_StoreScanline( dptr + rowPitch*2, rowPitch, format, &temp[8], 4 ) )
+ return E_FAIL;
+
+ if ( !_StoreScanline( dptr + rowPitch*3, rowPitch, format, &temp[12], 4 ) )
+ return E_FAIL;
+ }
+ }
+
+ sptr += sbpp;
+ dptr += dbpp*4;
+ }
+
+ pSrc += cImage.rowPitch;
+ pDest += rowPitch*4;
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Compression
+//-------------------------------------------------------------------------------------
+HRESULT Compress( const Image& srcImage, DXGI_FORMAT format, DWORD compress, float alphaRef, ScratchImage& image )
+{
+ if ( IsCompressed(srcImage.format) || !IsCompressed(format) || IsTypeless(format) )
+ return E_INVALIDARG;
+
+ // Image size must be a multiple of 4 (degenerate cases for mipmaps are allowed)
+ bool degenerate = false;
+
+ size_t width = srcImage.width;
+ if ( (width % 4) != 0 )
+ {
+ if ( width != 1 && width != 2 )
+ return E_INVALIDARG;
+
+ degenerate = true;
+ }
+
+ size_t height = srcImage.height;
+ if ( (height % 4) != 0 )
+ {
+ if ( height != 1 && height != 2 )
+ return E_INVALIDARG;
+
+ degenerate = true;
+ }
+
+ // Create compressed image
+ HRESULT hr = image.Initialize2D( format, width, height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = image.GetImage( 0, 0, 0 );
+ if ( !img )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ // Compress single image
+ if ( (compress & TEX_COMPRESS_PARALLEL) && !degenerate )
+ {
+#ifndef _OPENMP
+ return E_NOTIMPL;
+#else
+ hr = _CompressBC_Parallel( srcImage, *img, _GetBCFlags( compress ), alphaRef );
+#endif // _OPENMP
+ }
+ else
+ {
+ hr = _CompressBC( srcImage, *img, _GetBCFlags( compress ), alphaRef, degenerate );
+ }
+
+ if ( FAILED(hr) )
+ image.Release();
+
+ return hr;
+}
+
+HRESULT Compress( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DXGI_FORMAT format, DWORD compress, float alphaRef, ScratchImage& cImages )
+{
+ if ( !srcImages || !nimages )
+ return E_INVALIDARG;
+
+ if ( !IsCompressed(format) || IsTypeless(format) )
+ return E_INVALIDARG;
+
+ // Image size must be a multiple of 4 (degenerate cases for mipmaps are allowed)
+ size_t width = srcImages[0].width;
+ if ( (width % 4) != 0 )
+ {
+ if ( width != 1 && width != 2 )
+ return E_INVALIDARG;
+ }
+
+ size_t height = srcImages[0].height;
+ if ( (height % 4) != 0 )
+ {
+ if ( height != 1 && height != 2 )
+ return E_INVALIDARG;
+ }
+
+ cImages.Release();
+
+ TexMetadata mdata2 = metadata;
+ mdata2.format = format;
+ HRESULT hr = cImages.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != cImages.GetImageCount() )
+ {
+ cImages.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = cImages.GetImages();
+ if ( !dest )
+ {
+ cImages.Release();
+ return E_POINTER;
+ }
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ assert( dest[ index ].format == format );
+
+ const Image& src = srcImages[ index ];
+
+ height = src.height;
+ width = src.width;
+ if ( width != dest[ index ].width || height != dest[ index ].height )
+ {
+ cImages.Release();
+ return E_FAIL;
+ }
+
+ bool degenerate = ((height < 4) || (width < 4)) != 0;
+
+ if ( (compress & TEX_COMPRESS_PARALLEL) && !degenerate)
+ {
+#ifndef _OPENMP
+ return E_NOTIMPL;
+#else
+ if ( compress & TEX_COMPRESS_PARALLEL )
+ {
+ hr = _CompressBC_Parallel( src, dest[ index ], _GetBCFlags( compress ), alphaRef );
+ if ( FAILED(hr) )
+ {
+ cImages.Release();
+ return hr;
+ }
+ }
+#endif // _OPENMP
+ }
+ else
+ {
+ hr = _CompressBC( src, dest[ index ], _GetBCFlags( compress ), alphaRef, degenerate );
+ if ( FAILED(hr) )
+ {
+ cImages.Release();
+ return hr;
+ }
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Decompression
+//-------------------------------------------------------------------------------------
+HRESULT Decompress( const Image& cImage, DXGI_FORMAT format, ScratchImage& image )
+{
+ if ( IsCompressed(format) || IsTypeless(format) )
+ return E_INVALIDARG;
+
+ if ( format == DXGI_FORMAT_UNKNOWN )
+ {
+ // Pick a default decompressed format based on BC input format
+ format = _DefaultDecompress( cImage.format );
+ if ( format == DXGI_FORMAT_UNKNOWN )
+ {
+ // Input is not a compressed format
+ return E_INVALIDARG;
+ }
+ }
+ else if ( !IsCompressed(cImage.format) || !IsValid(format) )
+ return E_INVALIDARG;
+
+ // Create decompressed image
+ HRESULT hr = image.Initialize2D( format, cImage.width, cImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = image.GetImage( 0, 0, 0 );
+ if ( !img )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ // Decompress single image
+ hr = _DecompressBC( cImage, *img );
+ if ( FAILED(hr) )
+ image.Release();
+
+ return hr;
+}
+
+HRESULT Decompress( const Image* cImages, size_t nimages, const TexMetadata& metadata,
+ DXGI_FORMAT format, ScratchImage& images )
+{
+ if ( !cImages || !nimages )
+ return E_INVALIDARG;
+
+ if ( IsCompressed(format) || IsTypeless(format) )
+ return E_INVALIDARG;
+
+ if ( format == DXGI_FORMAT_UNKNOWN )
+ {
+ // Pick a default decompressed format based on BC input format
+ format = _DefaultDecompress( cImages[0].format );
+ if ( format == DXGI_FORMAT_UNKNOWN )
+ {
+ // Input is not a compressed format
+ return E_FAIL;
+ }
+ }
+ else if ( !IsValid(format) )
+ return E_INVALIDARG;
+
+ images.Release();
+
+ TexMetadata mdata2 = metadata;
+ mdata2.format = format;
+ HRESULT hr = images.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != images.GetImageCount() )
+ {
+ images.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = images.GetImages();
+ if ( !dest )
+ {
+ images.Release();
+ return E_POINTER;
+ }
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ assert( dest[ index ].format == format );
+
+ const Image& src = cImages[ index ];
+ if ( !IsCompressed( src.format ) )
+ {
+ images.Release();
+ return E_FAIL;
+ }
+
+ if ( src.width != dest[ index ].width || src.height != dest[ index ].height )
+ {
+ images.Release();
+ return E_FAIL;
+ }
+
+ hr = _DecompressBC( src, dest[ index ] );
+ if ( FAILED(hr) )
+ {
+ images.Release();
+ return hr;
+ }
+ }
+
+ return S_OK;
+}
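+
+// A minimal usage sketch for the Decompress() overloads above (illustrative only); "bcImage"
+// is a placeholder for a block-compressed ScratchImage. Passing DXGI_FORMAT_UNKNOWN lets the
+// library pick a default uncompressed format for the BC input:
+//
+//     ScratchImage rgbaImage;
+//     HRESULT hr = Decompress( bcImage.GetImages(), bcImage.GetImageCount(),
+//                              bcImage.GetMetadata(), DXGI_FORMAT_UNKNOWN, rgbaImage );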
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp
new file mode 100644
index 00000000..7c607346
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexConvert.cpp
@@ -0,0 +1,2420 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexConvert.cpp
+//
+// DirectX Texture Library - Image conversion
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#ifdef USE_XNAMATH
+#if XNAMATH_VERSION < 205
+#error This file requires XNAMATH v2.05 or later
+#endif
+#else
+using namespace DirectX::PackedVector;
+#endif
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Copies an image row, with optional clearing of the alpha value to 1.0
+// (can be used in place as well); otherwise copies the image row unmodified.
+//-------------------------------------------------------------------------------------
+void _CopyScanline( LPVOID pDestination, size_t outSize, LPCVOID pSource, size_t inSize, DXGI_FORMAT format, DWORD flags )
+{
+ assert( pDestination && outSize > 0 );
+ assert( pSource && inSize > 0 );
+ assert( IsValid(format) && !IsVideo(format) );
+
+ if ( flags & TEXP_SCANLINE_SETALPHA )
+ {
+ switch( format )
+ {
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_R32G32B32A32_TYPELESS:
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ case DXGI_FORMAT_R32G32B32A32_UINT:
+ case DXGI_FORMAT_R32G32B32A32_SINT:
+ {
+ uint32_t alpha;
+ if ( format == DXGI_FORMAT_R32G32B32A32_FLOAT )
+ alpha = 0x3f800000;
+ else if ( format == DXGI_FORMAT_R32G32B32A32_SINT )
+ alpha = 0x7fffffff;
+ else
+ alpha = 0xffffffff;
+
+ if ( pDestination == pSource )
+ {
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 16 )
+ {
+ dPtr += 3;
+ *(dPtr++) = alpha;
+ }
+ }
+ else
+ {
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 16 )
+ {
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = alpha;
+ sPtr++;
+ }
+ }
+ }
+ return;
+
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ case DXGI_FORMAT_R16G16B16A16_UINT:
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ case DXGI_FORMAT_R16G16B16A16_SINT:
+ {
+ uint16_t alpha;
+ if ( format == DXGI_FORMAT_R16G16B16A16_FLOAT )
+ alpha = 0x3c00;
+ else if ( format == DXGI_FORMAT_R16G16B16A16_SNORM || format == DXGI_FORMAT_R16G16B16A16_SINT )
+ alpha = 0x7fff;
+ else
+ alpha = 0xffff;
+
+ if ( pDestination == pSource )
+ {
+ uint16_t *dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 8 )
+ {
+ dPtr += 3;
+ *(dPtr++) = alpha;
+ }
+ }
+ else
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 8 )
+ {
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = *(sPtr++);
+ *(dPtr++) = alpha;
+ sPtr++;
+ }
+ }
+ }
+ return;
+
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ if ( pDestination == pSource )
+ {
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 4 )
+ {
+#pragma warning(suppress: 6001 6101) // PREFast doesn't properly understand the aliasing here.
+ *dPtr |= 0xC0000000;
+ ++dPtr;
+ }
+ }
+ else
+ {
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 4 )
+ {
+ *(dPtr++) = *(sPtr++) | 0xC0000000;
+ }
+ }
+ return;
+
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ case DXGI_FORMAT_R8G8B8A8_UINT:
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ case DXGI_FORMAT_R8G8B8A8_SINT:
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ {
+ const uint32_t alpha = ( format == DXGI_FORMAT_R8G8B8A8_SNORM || format == DXGI_FORMAT_R8G8B8A8_SINT ) ? 0x7f000000 : 0xff000000;
+
+ if ( pDestination == pSource )
+ {
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 4 )
+ {
+ uint32_t t = *dPtr & 0xFFFFFF;
+ t |= alpha;
+ *(dPtr++) = t;
+ }
+ }
+ else
+ {
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 4 )
+ {
+ uint32_t t = *(sPtr++) & 0xFFFFFF;
+ t |= alpha;
+ *(dPtr++) = t;
+ }
+ }
+ }
+ return;
+
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ if ( pDestination == pSource )
+ {
+ uint16_t *dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 2 )
+ {
+ *(dPtr++) |= 0x8000;
+ }
+ }
+ else
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 2 )
+ {
+ *(dPtr++) = *(sPtr++) | 0x8000;
+ }
+ }
+ return;
+
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_A8_UNORM:
+ memset( pDestination, 0xff, outSize );
+ return;
+
+#ifdef DXGI_1_2_FORMATS
+ //-----------------------------------------------------------------------------
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+ if ( pDestination == pSource )
+ {
+ uint16_t *dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 2 )
+ {
+ *(dPtr++) |= 0xF000;
+ }
+ }
+ else
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 2 )
+ {
+ *(dPtr++) = *(sPtr++) | 0xF000;
+ }
+ }
+ return;
+#endif // DXGI_1_2_FORMATS
+ }
+ }
+
+ // Fall-through case is to just use memcpy (assuming this is not an in-place operation)
+ if ( pDestination == pSource )
+ return;
+
+ size_t size = std::min<size_t>( outSize, inSize );
+ memcpy_s( pDestination, outSize, pSource, size );
+}
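+
+// A minimal usage sketch for _CopyScanline() (illustrative only): force the alpha channel of
+// one RGBA8 row to fully opaque, in place. "row" and "rowPitch" are placeholders for a
+// scanline pointer and its size in bytes:
+//
+//     _CopyScanline( row, rowPitch, row, rowPitch,
+//                    DXGI_FORMAT_R8G8B8A8_UNORM, TEXP_SCANLINE_SETALPHA );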
+
+
+//-------------------------------------------------------------------------------------
+// Swizzles (RGB <-> BGR) an image row, with optional clearing of the alpha value to 1.0
+// (can be used in place as well); otherwise copies the image row unmodified.
+//-------------------------------------------------------------------------------------
+void _SwizzleScanline( LPVOID pDestination, size_t outSize, LPCVOID pSource, size_t inSize, DXGI_FORMAT format, DWORD flags )
+{
+ assert( pDestination && outSize > 0 );
+ assert( pSource && inSize > 0 );
+ assert( IsValid(format) && !IsVideo(format) );
+
+ switch( format )
+ {
+ //---------------------------------------------------------------------------------
+ case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ if ( flags & TEXP_SCANLINE_LEGACY )
+ {
+ // Swap Red (R) and Blue (B) channel (used for D3DFMT_A2R10G10B10 legacy sources)
+ if ( pDestination == pSource )
+ {
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 4 )
+ {
+#pragma warning(suppress: 6001 6101) // PREFast doesn't properly understand the aliasing here.
+ uint32_t t = *dPtr;
+
+ uint32_t t1 = (t & 0x3ff00000) >> 20;
+ uint32_t t2 = (t & 0x000003ff) << 20;
+ uint32_t t3 = (t & 0x000ffc00);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xC0000000 : (t & 0xC0000000);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ else
+ {
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 4 )
+ {
+ uint32_t t = *(sPtr++);
+
+ uint32_t t1 = (t & 0x3ff00000) >> 20;
+ uint32_t t2 = (t & 0x000003ff) << 20;
+ uint32_t t3 = (t & 0x000ffc00);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xC0000000 : (t & 0xC0000000);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return;
+ }
+ break;
+
+ //---------------------------------------------------------------------------------
+ case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ // Swap Red (R) and Blue (B) channels (used to convert from DXGI 1.1 BGR formats to DXGI 1.0 RGB)
+ if ( pDestination == pSource )
+ {
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t count = 0; count < outSize; count += 4 )
+ {
+ uint32_t t = *dPtr;
+
+ uint32_t t1 = (t & 0x00ff0000) >> 16;
+ uint32_t t2 = (t & 0x000000ff) << 16;
+ uint32_t t3 = (t & 0x0000ff00);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (t & 0xFF000000);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ else
+ {
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ size_t size = std::min<size_t>( outSize, inSize );
+ for( size_t count = 0; count < size; count += 4 )
+ {
+ uint32_t t = *(sPtr++);
+
+ uint32_t t1 = (t & 0x00ff0000) >> 16;
+ uint32_t t2 = (t & 0x000000ff) << 16;
+ uint32_t t3 = (t & 0x0000ff00);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (t & 0xFF000000);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return;
+ }
+
+ // Fall-through case is to just use memcpy (assuming this is not an in-place operation)
+ if ( pDestination == pSource )
+ return;
+
+ size_t size = std::min<size_t>( outSize, inSize );
+ memcpy_s( pDestination, outSize, pSource, size );
+}
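+
+// A minimal usage sketch for _SwizzleScanline() (illustrative only): convert one BGRA8 row to
+// RGBA8 in place by swapping the R and B channels; TEXP_SCANLINE_LEGACY only matters for the
+// 10:10:10:2 formats, where the swap is reserved for legacy D3DFMT_A2R10G10B10 sources:
+//
+//     _SwizzleScanline( row, rowPitch, row, rowPitch,
+//                       DXGI_FORMAT_B8G8R8A8_UNORM, 0 /* no TEXP_SCANLINE_* flags */ );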
+
+
+//-------------------------------------------------------------------------------------
+// Expands an image row from a legacy format, with optional clearing of the alpha value to 1.0
+// Returns true if supported, false if the expansion case is not supported
+//-------------------------------------------------------------------------------------
+bool _ExpandScanline( LPVOID pDestination, size_t outSize, DXGI_FORMAT outFormat,
+ LPCVOID pSource, size_t inSize, DXGI_FORMAT inFormat, DWORD flags )
+{
+ assert( pDestination && outSize > 0 );
+ assert( pSource && inSize > 0 );
+ assert( IsValid(outFormat) && !IsVideo(outFormat) );
+ assert( IsValid(inFormat) && !IsVideo(inFormat) );
+
+ switch( inFormat )
+ {
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM )
+ return false;
+
+ // DXGI_FORMAT_B5G6R5_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = ((t & 0xf800) >> 8) | ((t & 0xe000) >> 13);
+ uint32_t t2 = ((t & 0x07e0) << 5) | ((t & 0x0600) >> 5);
+ uint32_t t3 = ((t & 0x001f) << 19) | ((t & 0x001c) << 14);
+
+ *(dPtr++) = t1 | t2 | t3 | 0xff000000;
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM )
+ return false;
+
+ // DXGI_FORMAT_B5G5R5A1_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = ((t & 0x7c00) >> 7) | ((t & 0x7000) >> 12);
+ uint32_t t2 = ((t & 0x03e0) << 6) | ((t & 0x0380) << 1);
+ uint32_t t3 = ((t & 0x001f) << 19) | ((t & 0x001c) << 14);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0x8000) ? 0xff000000 : 0);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return true;
+
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+ if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM )
+ return false;
+
+ // DXGI_FORMAT_B4G4R4A4_UNORM -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = ((t & 0x0f00) >> 4) | ((t & 0x0f00) >> 8);
+ uint32_t t2 = ((t & 0x00f0) << 8) | ((t & 0x00f0) << 4);
+ uint32_t t3 = ((t & 0x000f) << 20) | ((t & 0x000f) << 16);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (((t & 0xf000) << 16) | ((t & 0xf000) << 12));
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return true;
+#endif // DXGI_1_2_FORMATS
+ }
+
+ return false;
+}
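+
+// Worked example of the bit-replication used above (illustrative only): an n-bit channel is
+// widened to 8 bits by shifting it to the top of the byte and copying its high bits into the
+// vacated low bits. For the 5-bit red channel of B5G6R5,
+//
+//     r8 = (r5 << 3) | (r5 >> 2);   // 0b11111 -> 0xFF, 0b10000 -> 0x84
+//
+// which is exactly what ((t & 0xf800) >> 8) | ((t & 0xe000) >> 13) computes on the packed
+// 16-bit pixel.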
+
+
+//-------------------------------------------------------------------------------------
+// Loads an image row into standard RGBA XMVECTOR (aligned) array
+//-------------------------------------------------------------------------------------
+#define LOAD_SCANLINE( type, func )\
+ if ( size >= sizeof(type) )\
+ {\
+ const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+ for( size_t icount = 0; icount < size; icount += sizeof(type) )\
+ {\
+ if ( dPtr >= ePtr ) break;\
+ *(dPtr++) = func( sPtr++ );\
+ }\
+ return true;\
+ }\
+ return false;
+
+#define LOAD_SCANLINE3( type, func, defvec )\
+ if ( size >= sizeof(type) )\
+ {\
+ const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+ for( size_t icount = 0; icount < size; icount += sizeof(type) )\
+ {\
+ XMVECTOR v = func( sPtr++ );\
+ if ( dPtr >= ePtr ) break;\
+ *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\
+ }\
+ return true;\
+ }\
+ return false;
+
+#define LOAD_SCANLINE2( type, func, defvec )\
+ if ( size >= sizeof(type) )\
+ {\
+ const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+ for( size_t icount = 0; icount < size; icount += sizeof(type) )\
+ {\
+ XMVECTOR v = func( sPtr++ );\
+ if ( dPtr >= ePtr ) break;\
+ *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\
+ }\
+ return true;\
+ }\
+ return false;
+
+bool _LoadScanline( XMVECTOR* pDestination, size_t count,
+ LPCVOID pSource, size_t size, DXGI_FORMAT format )
+{
+ assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) );
+ assert( pSource && size > 0 );
+ assert( IsValid(format) && !IsVideo(format) && !IsTypeless(format) && !IsCompressed(format) );
+
+ XMVECTOR* __restrict dPtr = pDestination;
+ if ( !dPtr )
+ return false;
+
+ const XMVECTOR* ePtr = pDestination + count;
+
+ switch( format )
+ {
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ {
+ size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size;
+ memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize );
+ }
+ return true;
+
+ case DXGI_FORMAT_R32G32B32A32_UINT:
+ LOAD_SCANLINE( XMUINT4, XMLoadUInt4 )
+
+ case DXGI_FORMAT_R32G32B32A32_SINT:
+ LOAD_SCANLINE( XMINT4, XMLoadSInt4 )
+
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R32G32B32_UINT:
+ LOAD_SCANLINE3( XMUINT3, XMLoadUInt3, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R32G32B32_SINT:
+ LOAD_SCANLINE3( XMINT3, XMLoadSInt3, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ LOAD_SCANLINE( XMHALF4, XMLoadHalf4 )
+
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ LOAD_SCANLINE( XMUSHORTN4, XMLoadUShortN4 )
+
+ case DXGI_FORMAT_R16G16B16A16_UINT:
+ LOAD_SCANLINE( XMUSHORT4, XMLoadUShort4 )
+
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ LOAD_SCANLINE( XMSHORTN4, XMLoadShortN4 )
+
+ case DXGI_FORMAT_R16G16B16A16_SINT:
+ LOAD_SCANLINE( XMSHORT4, XMLoadShort4 )
+
+ case DXGI_FORMAT_R32G32_FLOAT:
+ LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R32G32_UINT:
+ LOAD_SCANLINE2( XMUINT2, XMLoadUInt2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R32G32_SINT:
+ LOAD_SCANLINE2( XMINT2, XMLoadSInt2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
+ if ( size >= (sizeof(float)+sizeof(uint32_t)) )
+ {
+ const float * sPtr = reinterpret_cast<const float*>(pSource);
+ for( size_t icount = 0; icount < size; icount += (sizeof(float)+sizeof(uint32_t)) )
+ {
+ const uint8_t* ps8 = reinterpret_cast<const uint8_t*>( &sPtr[1] );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( sPtr[0], static_cast<float>( *ps8 ), 0.f, 1.f );
+ sPtr += 2;
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ LOAD_SCANLINE( XMUDECN4, XMLoadUDecN4 );
+
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ LOAD_SCANLINE( XMUDEC4, XMLoadUDec4 );
+
+ case DXGI_FORMAT_R11G11B10_FLOAT:
+ LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 );
+
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ LOAD_SCANLINE( XMUBYTEN4, XMLoadUByteN4 )
+
+ case DXGI_FORMAT_R8G8B8A8_UINT:
+ LOAD_SCANLINE( XMUBYTE4, XMLoadUByte4 )
+
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ LOAD_SCANLINE( XMBYTEN4, XMLoadByteN4 )
+
+ case DXGI_FORMAT_R8G8B8A8_SINT:
+ LOAD_SCANLINE( XMBYTE4, XMLoadByte4 )
+
+ case DXGI_FORMAT_R16G16_FLOAT:
+ LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16G16_UNORM:
+ LOAD_SCANLINE2( XMUSHORTN2, XMLoadUShortN2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16G16_UINT:
+ LOAD_SCANLINE2( XMUSHORT2, XMLoadUShort2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16G16_SNORM:
+ LOAD_SCANLINE2( XMSHORTN2, XMLoadShortN2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16G16_SINT:
+ LOAD_SCANLINE2( XMSHORT2, XMLoadShort2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_D32_FLOAT:
+ case DXGI_FORMAT_R32_FLOAT:
+ if ( size >= sizeof(float) )
+ {
+ const float* __restrict sPtr = reinterpret_cast<const float*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(float) )
+ {
+ XMVECTOR v = XMLoadFloat( sPtr++ );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R32_UINT:
+ if ( size >= sizeof(uint32_t) )
+ {
+ const uint32_t* __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) )
+ {
+ XMVECTOR v = XMLoadInt( sPtr++ );
+ v = XMConvertVectorUIntToFloat( v, 0 );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R32_SINT:
+ if ( size >= sizeof(int32_t) )
+ {
+ const int32_t * __restrict sPtr = reinterpret_cast<const int32_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(int32_t) )
+ {
+ XMVECTOR v = XMLoadInt( reinterpret_cast<const uint32_t*> (sPtr++) );
+ v = XMConvertVectorIntToFloat( v, 0 );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_D24_UNORM_S8_UINT:
+ if ( size >= sizeof(uint32_t) )
+ {
+ const uint32_t * sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) )
+ {
+ float d = static_cast<float>( *sPtr & 0xFFFFFF ) / 16777215.f;
+ float s = static_cast<float>( ( *sPtr & 0xFF000000 ) >> 24 );
+ ++sPtr;
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( d, s, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R8G8_UNORM:
+ LOAD_SCANLINE2( XMUBYTEN2, XMLoadUByteN2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R8G8_UINT:
+ LOAD_SCANLINE2( XMUBYTE2, XMLoadUByte2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R8G8_SNORM:
+ LOAD_SCANLINE2( XMBYTEN2, XMLoadByteN2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R8G8_SINT:
+ LOAD_SCANLINE2( XMBYTE2, XMLoadByte2, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R16_FLOAT:
+ if ( size >= sizeof(HALF) )
+ {
+ const HALF * __restrict sPtr = reinterpret_cast<const HALF*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(HALF) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_D16_UNORM:
+ case DXGI_FORMAT_R16_UNORM:
+ if ( size >= sizeof(uint16_t) )
+ {
+ const uint16_t* __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++) / 65535.f, 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R16_UINT:
+ if ( size >= sizeof(uint16_t) )
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++), 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R16_SNORM:
+ if ( size >= sizeof(int16_t) )
+ {
+ const int16_t * __restrict sPtr = reinterpret_cast<const int16_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(int16_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++) / 32767.f, 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R16_SINT:
+ if ( size >= sizeof(int16_t) )
+ {
+ const int16_t * __restrict sPtr = reinterpret_cast<const int16_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(int16_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++), 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R8_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++) / 255.f, 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R8_UINT:
+ if ( size >= sizeof(uint8_t) )
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++), 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R8_SNORM:
+ if ( size >= sizeof(char) )
+ {
+ const char * __restrict sPtr = reinterpret_cast<const char*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(char) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++) / 127.f, 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R8_SINT:
+ if ( size >= sizeof(char) )
+ {
+ const char * __restrict sPtr = reinterpret_cast<const char*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(char) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( static_cast<float>(*sPtr++), 0.f, 0.f, 1.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_A8_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( 0.f, 0.f, 0.f, static_cast<float>(*sPtr++) / 255.f );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R1_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ for( size_t bcount = 0; bcount < 8; ++bcount )
+ {
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSet( (((*sPtr >> bcount) & 0x1) ? 1.f : 0.f), 0.f, 0.f, 1.f );
+ }
+
+ ++sPtr;
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
+ LOAD_SCANLINE3( XMFLOAT3SE, XMLoadFloat3SE, g_XMIdentityR3 )
+
+ case DXGI_FORMAT_R8G8_B8G8_UNORM:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ const XMUBYTEN4 * __restrict sPtr = reinterpret_cast<const XMUBYTEN4*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ XMVECTOR v = XMLoadUByteN4( sPtr++ );
+ XMVECTOR v1 = XMVectorSwizzle<0, 3, 2, 1>( v );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v1, g_XMSelect1110 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_G8R8_G8B8_UNORM:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ const XMUBYTEN4 * __restrict sPtr = reinterpret_cast<const XMUBYTEN4*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ XMVECTOR v = XMLoadUByteN4( sPtr++ );
+ XMVECTOR v0 = XMVectorSwizzle<1, 0, 3, 2>( v );
+ XMVECTOR v1 = XMVectorSwizzle<1, 2, 3, 0>( v );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v0, g_XMSelect1110 );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v1, g_XMSelect1110 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ if ( size >= sizeof(XMU565) )
+ {
+ static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/63.f, 1.f/31.f, 1.f };
+ const XMU565 * __restrict sPtr = reinterpret_cast<const XMU565*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMU565) )
+ {
+ XMVECTOR v = XMLoadU565( sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ v = XMVectorSwizzle<2, 1, 0, 3>( v );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ if ( size >= sizeof(XMU555) )
+ {
+ static XMVECTORF32 s_Scale = { 1.f/31.f, 1.f/31.f, 1.f/31.f, 1.f };
+ const XMU555 * __restrict sPtr = reinterpret_cast<const XMU555*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMU555) )
+ {
+ XMVECTOR v = XMLoadU555( sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ const XMUBYTEN4 * __restrict sPtr = reinterpret_cast<const XMUBYTEN4*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ XMVECTOR v = XMLoadUByteN4( sPtr++ );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v );
+ }
+ return true;
+ }
+ return false;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ const XMUBYTEN4 * __restrict sPtr = reinterpret_cast<const XMUBYTEN4*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ XMVECTOR v = XMLoadUByteN4( sPtr++ );
+ v = XMVectorSwizzle<2, 1, 0, 3>( v );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1110 );
+ }
+ return true;
+ }
+ return false;
+
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+ if ( size >= sizeof(XMUNIBBLE4) )
+ {
+ static XMVECTORF32 s_Scale = { 1.f/15.f, 1.f/15.f, 1.f/15.f, 1.f/15.f };
+ const XMUNIBBLE4 * __restrict sPtr = reinterpret_cast<const XMUNIBBLE4*>(pSource);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUNIBBLE4) )
+ {
+ XMVECTOR v = XMLoadUNibble4( sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ if ( dPtr >= ePtr ) break;
+ *(dPtr++) = XMVectorSwizzle<2, 1, 0, 3>( v );
+ }
+ return true;
+ }
+ return false;
+
+ // we don't support the video formats ( see IsVideo function )
+#endif // DXGI_1_2_FORMATS
+
+ default:
+ return false;
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// Stores an image row from standard RGBA XMVECTOR (aligned) array
+//-------------------------------------------------------------------------------------
+#define STORE_SCANLINE( type, func )\
+ if ( size >= sizeof(type) )\
+ {\
+ type * __restrict dPtr = reinterpret_cast<type*>(pDestination);\
+ for( size_t icount = 0; icount < size; icount += sizeof(type) )\
+ {\
+ if ( sPtr >= ePtr ) break;\
+ func( dPtr++, *sPtr++ );\
+ }\
+ }\
+ return true;
+
+bool _StoreScanline( LPVOID pDestination, size_t size, DXGI_FORMAT format,
+ const XMVECTOR* pSource, size_t count )
+{
+ assert( pDestination && size > 0 );
+ assert( pSource && count > 0 && (((uintptr_t)pSource & 0xF) == 0) );
+ assert( IsValid(format) && !IsVideo(format) && !IsTypeless(format) && !IsCompressed(format) );
+
+ const XMVECTOR* __restrict sPtr = pSource;
+ if ( !sPtr )
+ return false;
+
+ const XMVECTOR* ePtr = pSource + count;
+
+ switch( format )
+ {
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ STORE_SCANLINE( XMFLOAT4, XMStoreFloat4 )
+
+ case DXGI_FORMAT_R32G32B32A32_UINT:
+ STORE_SCANLINE( XMUINT4, XMStoreUInt4 )
+
+ case DXGI_FORMAT_R32G32B32A32_SINT:
+ STORE_SCANLINE( XMINT4, XMStoreSInt4 )
+
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ STORE_SCANLINE( XMFLOAT3, XMStoreFloat3 )
+
+ case DXGI_FORMAT_R32G32B32_UINT:
+ STORE_SCANLINE( XMUINT3, XMStoreUInt3 )
+
+ case DXGI_FORMAT_R32G32B32_SINT:
+ STORE_SCANLINE( XMINT3, XMStoreSInt3 )
+
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ STORE_SCANLINE( XMHALF4, XMStoreHalf4 )
+
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ STORE_SCANLINE( XMUSHORTN4, XMStoreUShortN4 )
+
+ case DXGI_FORMAT_R16G16B16A16_UINT:
+ STORE_SCANLINE( XMUSHORT4, XMStoreUShort4 )
+
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ STORE_SCANLINE( XMSHORTN4, XMStoreShortN4 )
+
+ case DXGI_FORMAT_R16G16B16A16_SINT:
+ STORE_SCANLINE( XMSHORT4, XMStoreShort4 )
+
+ case DXGI_FORMAT_R32G32_FLOAT:
+ STORE_SCANLINE( XMFLOAT2, XMStoreFloat2 )
+
+ case DXGI_FORMAT_R32G32_UINT:
+ STORE_SCANLINE( XMUINT2, XMStoreUInt2 )
+
+ case DXGI_FORMAT_R32G32_SINT:
+ STORE_SCANLINE( XMINT2, XMStoreSInt2 )
+
+ case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
+ if ( size >= (sizeof(float)+sizeof(uint32_t)) )
+ {
+ float *dPtr = reinterpret_cast<float*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += (sizeof(float)+sizeof(uint32_t)) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMFLOAT4 f;
+ XMStoreFloat4( &f, *sPtr++ );
+ dPtr[0] = f.x;
+ uint8_t* ps8 = reinterpret_cast<uint8_t*>( &dPtr[1] );
+ ps8[0] = static_cast<uint8_t>( std::min<float>( 255.f, std::max<float>( 0.f, f.y ) ) );
+ ps8[1] = ps8[2] = ps8[3] = 0;
+ dPtr += 2;
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ STORE_SCANLINE( XMUDECN4, XMStoreUDecN4 );
+
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ STORE_SCANLINE( XMUDEC4, XMStoreUDec4 );
+
+ case DXGI_FORMAT_R11G11B10_FLOAT:
+ STORE_SCANLINE( XMFLOAT3PK, XMStoreFloat3PK );
+
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ STORE_SCANLINE( XMUBYTEN4, XMStoreUByteN4 )
+
+ case DXGI_FORMAT_R8G8B8A8_UINT:
+ STORE_SCANLINE( XMUBYTE4, XMStoreUByte4 )
+
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ STORE_SCANLINE( XMBYTEN4, XMStoreByteN4 )
+
+ case DXGI_FORMAT_R8G8B8A8_SINT:
+ STORE_SCANLINE( XMBYTE4, XMStoreByte4 )
+
+ case DXGI_FORMAT_R16G16_FLOAT:
+ STORE_SCANLINE( XMHALF2, XMStoreHalf2 )
+
+ case DXGI_FORMAT_R16G16_UNORM:
+ STORE_SCANLINE( XMUSHORTN2, XMStoreUShortN2 )
+
+ case DXGI_FORMAT_R16G16_UINT:
+ STORE_SCANLINE( XMUSHORT2, XMStoreUShort2 )
+
+ case DXGI_FORMAT_R16G16_SNORM:
+ STORE_SCANLINE( XMSHORTN2, XMStoreShortN2 )
+
+ case DXGI_FORMAT_R16G16_SINT:
+ STORE_SCANLINE( XMSHORT2, XMStoreShort2 )
+
+ case DXGI_FORMAT_D32_FLOAT:
+ case DXGI_FORMAT_R32_FLOAT:
+ if ( size >= sizeof(float) )
+ {
+ float * __restrict dPtr = reinterpret_cast<float*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(float) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMStoreFloat( dPtr++, *(sPtr++) );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R32_UINT:
+ if ( size >= sizeof(uint32_t) )
+ {
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMConvertVectorFloatToUInt( *(sPtr++), 0 );
+ XMStoreInt( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R32_SINT:
+ if ( size >= sizeof(uint32_t) )
+ {
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMConvertVectorFloatToInt( *(sPtr++), 0 );
+ XMStoreInt( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_D24_UNORM_S8_UINT:
+ if ( size >= sizeof(uint32_t) )
+ {
+ static const XMVECTORF32 clamp = { 1.f, 255.f, 0.f, 0.f };
+ XMVECTOR zero = XMVectorZero();
+ uint32_t *dPtr = reinterpret_cast<uint32_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint32_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMFLOAT4 f;
+ XMStoreFloat4( &f, XMVectorClamp( *sPtr++, zero, clamp ) );
+ *dPtr++ = (static_cast<uint32_t>( f.x * 16777215.f ) & 0xFFFFFF)
+ | ((static_cast<uint32_t>( f.y ) & 0xFF) << 24);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R8G8_UNORM:
+ STORE_SCANLINE( XMUBYTEN2, XMStoreUByteN2 )
+
+ case DXGI_FORMAT_R8G8_UINT:
+ STORE_SCANLINE( XMUBYTE2, XMStoreUByte2 )
+
+ case DXGI_FORMAT_R8G8_SNORM:
+ STORE_SCANLINE( XMBYTEN2, XMStoreByteN2 )
+
+ case DXGI_FORMAT_R8G8_SINT:
+ STORE_SCANLINE( XMBYTE2, XMStoreByte2 )
+
+ case DXGI_FORMAT_R16_FLOAT:
+ if ( size >= sizeof(HALF) )
+ {
+ HALF * __restrict dPtr = reinterpret_cast<HALF*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(HALF) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ *(dPtr++) = XMConvertFloatToHalf(v);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_D16_UNORM:
+ case DXGI_FORMAT_R16_UNORM:
+        if ( size >= sizeof(uint16_t) )
+        {
+            uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+            for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 1.f ), 0.f );
+ *(dPtr++) = static_cast<uint16_t>( v*65535.f + 0.5f );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R16_UINT:
+ if ( size >= sizeof(uint16_t) )
+ {
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint16_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 65535.f ), 0.f );
+ *(dPtr++) = static_cast<uint16_t>(v);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R16_SNORM:
+ if ( size >= sizeof(int16_t) )
+ {
+ int16_t * __restrict dPtr = reinterpret_cast<int16_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(int16_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 1.f ), -1.f );
+                *(dPtr++) = static_cast<int16_t>( v * 32767.f );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R16_SINT:
+ if ( size >= sizeof(int16_t) )
+ {
+ int16_t * __restrict dPtr = reinterpret_cast<int16_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(int16_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 32767.f ), -32767.f );
+ *(dPtr++) = static_cast<int16_t>(v);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R8_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ uint8_t * __restrict dPtr = reinterpret_cast<uint8_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 1.f ), 0.f );
+ *(dPtr++) = static_cast<uint8_t>( v * 255.f);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R8_UINT:
+ if ( size >= sizeof(uint8_t) )
+ {
+ uint8_t * __restrict dPtr = reinterpret_cast<uint8_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 255.f ), 0.f );
+ *(dPtr++) = static_cast<uint8_t>(v);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R8_SNORM:
+ if ( size >= sizeof(char) )
+ {
+ char * __restrict dPtr = reinterpret_cast<char*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(char) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 1.f ), -1.f );
+ *(dPtr++) = static_cast<char>( v * 127.f );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R8_SINT:
+ if ( size >= sizeof(char) )
+ {
+ char * __restrict dPtr = reinterpret_cast<char*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(char) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 127.f ), -127.f );
+ *(dPtr++) = static_cast<char>( v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_A8_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ uint8_t * __restrict dPtr = reinterpret_cast<uint8_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetW( *sPtr++ );
+ v = std::max<float>( std::min<float>( v, 1.f ), 0.f );
+ *(dPtr++) = static_cast<uint8_t>( v * 255.f);
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R1_UNORM:
+ if ( size >= sizeof(uint8_t) )
+ {
+ uint8_t * __restrict dPtr = reinterpret_cast<uint8_t*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(uint8_t) )
+ {
+ uint8_t pixels = 0;
+ for( size_t bcount = 0; bcount < 8; ++bcount )
+ {
+ if ( sPtr >= ePtr ) break;
+ float v = XMVectorGetX( *sPtr++ );
+ if ( v > 0.5f )
+ pixels |= 1 << bcount;
+ }
+ *(dPtr++) = pixels;
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
+ STORE_SCANLINE( XMFLOAT3SE, XMStoreFloat3SE )
+
+ case DXGI_FORMAT_R8G8_B8G8_UNORM:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ XMUBYTEN4 * __restrict dPtr = reinterpret_cast<XMUBYTEN4*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v0 = *sPtr++;
+ XMVECTOR v1 = (sPtr < ePtr) ? XMVectorSplatY( *sPtr++ ) : XMVectorZero();
+ XMVECTOR v = XMVectorSelect( v1, v0, g_XMSelect1110 );
+ XMStoreUByteN4( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_G8R8_G8B8_UNORM:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ static XMVECTORI32 select1101 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
+
+ XMUBYTEN4 * __restrict dPtr = reinterpret_cast<XMUBYTEN4*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v0 = XMVectorSwizzle<1, 0, 3, 2>( *sPtr++ );
+ XMVECTOR v1 = (sPtr < ePtr) ? XMVectorSplatY( *sPtr++ ) : XMVectorZero();
+ XMVECTOR v = XMVectorSelect( v1, v0, select1101 );
+ XMStoreUByteN4( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ if ( size >= sizeof(XMU565) )
+ {
+ static XMVECTORF32 s_Scale = { 31.f, 63.f, 31.f, 1.f };
+ XMU565 * __restrict dPtr = reinterpret_cast<XMU565*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMU565) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ XMStoreU565( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ if ( size >= sizeof(XMU555) )
+ {
+ static XMVECTORF32 s_Scale = { 31.f, 31.f, 31.f, 1.f };
+ XMU555 * __restrict dPtr = reinterpret_cast<XMU555*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMU555) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ XMStoreU555( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ XMUBYTEN4 * __restrict dPtr = reinterpret_cast<XMUBYTEN4*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ );
+ XMStoreUByteN4( dPtr++, v );
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ if ( size >= sizeof(XMUBYTEN4) )
+ {
+ XMUBYTEN4 * __restrict dPtr = reinterpret_cast<XMUBYTEN4*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUBYTEN4) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMVectorPermute<2, 1, 0, 7>( *sPtr++, g_XMIdentityR3 );
+ XMStoreUByteN4( dPtr++, v );
+ }
+ }
+ return true;
+
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+ if ( size >= sizeof(XMUNIBBLE4) )
+ {
+ static XMVECTORF32 s_Scale = { 15.f, 15.f, 15.f, 15.f };
+ XMUNIBBLE4 * __restrict dPtr = reinterpret_cast<XMUNIBBLE4*>(pDestination);
+ for( size_t icount = 0; icount < size; icount += sizeof(XMUNIBBLE4) )
+ {
+ if ( sPtr >= ePtr ) break;
+ XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( *sPtr++ );
+ v = XMVectorMultiply( v, s_Scale );
+ XMStoreUNibble4( dPtr++, v );
+ }
+ }
+ return true;
+
+ // We don't support the video formats ( see IsVideo function )
+#endif // DXGI_1_2_FORMATS
+
+ default:
+ return false;
+ }
+}
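+
+// _LoadScanline() and _StoreScanline() are intended to be paired for row-by-row format
+// conversion, as the _Convert* helpers below do. A minimal sketch (illustrative only; "temp"
+// is a placeholder for a 16-byte aligned buffer of "width" XMVECTORs):
+//
+//     for( size_t y = 0; y < height; ++y )
+//     {
+//         _LoadScanline( temp, width, srcRow, srcRowPitch, srcFormat );
+//         _StoreScanline( dstRow, dstRowPitch, dstFormat, temp, width );
+//         srcRow += srcRowPitch;
+//         dstRow += dstRowPitch;
+//     }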
+
+
+//-------------------------------------------------------------------------------------
+// Convert DXGI image to/from GUID_WICPixelFormat128bppRGBAFloat (no range conversions)
+//-------------------------------------------------------------------------------------
+HRESULT _ConvertToR32G32B32A32( const Image& srcImage, ScratchImage& image )
+{
+ if ( !srcImage.pixels )
+ return E_POINTER;
+
+ HRESULT hr = image.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = image.GetImage( 0, 0, 0 );
+ if ( !img )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ uint8_t* pDest = img->pixels;
+ if ( !pDest )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ const uint8_t *pSrc = srcImage.pixels;
+ for( size_t h = 0; h < srcImage.height; ++h )
+ {
+ if ( !_LoadScanline( reinterpret_cast<XMVECTOR*>(pDest), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) )
+ {
+ image.Release();
+ return E_FAIL;
+ }
+
+ pSrc += srcImage.rowPitch;
+ pDest += img->rowPitch;
+ }
+
+ return S_OK;
+}
+
+HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ const Image& destImage )
+{
+ assert( srcImage.format == DXGI_FORMAT_R32G32B32A32_FLOAT );
+
+ if ( !srcImage.pixels || !destImage.pixels )
+ return E_POINTER;
+
+ if ( srcImage.width != destImage.width || srcImage.height != destImage.height )
+ return E_FAIL;
+
+ const uint8_t *pSrc = srcImage.pixels;
+ uint8_t* pDest = destImage.pixels;
+
+ for( size_t h = 0; h < srcImage.height; ++h )
+ {
+ if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, reinterpret_cast<const XMVECTOR*>(pSrc), srcImage.width ) )
+ return E_FAIL;
+
+ pSrc += srcImage.rowPitch;
+ pDest += destImage.rowPitch;
+ }
+
+ return S_OK;
+}
+
+HRESULT _ConvertFromR32G32B32A32( const Image& srcImage, DXGI_FORMAT format, ScratchImage& image )
+{
+ if ( !srcImage.pixels )
+ return E_POINTER;
+
+ HRESULT hr = image.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = image.GetImage( 0, 0, 0 );
+ if ( !img )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ hr = _ConvertFromR32G32B32A32( srcImage, *img );
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
+
+HRESULT _ConvertFromR32G32B32A32( const Image* srcImages, size_t nimages, const TexMetadata& metadata, DXGI_FORMAT format, ScratchImage& result )
+{
+ if ( !srcImages )
+ return E_POINTER;
+
+ result.Release();
+
+ assert( metadata.format == DXGI_FORMAT_R32G32B32A32_FLOAT );
+
+ TexMetadata mdata2 = metadata;
+ mdata2.format = format;
+ HRESULT hr = result.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != result.GetImageCount() )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = result.GetImages();
+ if ( !dest )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ const Image& src = srcImages[ index ];
+ const Image& dst = dest[ index ];
+
+ assert( src.format == DXGI_FORMAT_R32G32B32A32_FLOAT );
+ assert( dst.format == format );
+
+ if ( src.width != dst.width || src.height != dst.height )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const uint8_t* pSrc = src.pixels;
+ uint8_t* pDest = dst.pixels;
+ if ( !pSrc || !pDest )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ for( size_t h=0; h < src.height; ++h )
+ {
+ if ( !_StoreScanline( pDest, dst.rowPitch, format, reinterpret_cast<const XMVECTOR*>(pSrc), src.width ) )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ pSrc += src.rowPitch;
+ pDest += dst.rowPitch;
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// RGB -> sRGB
+//-------------------------------------------------------------------------------------
+static const uint32_t g_fEncodeGamma22[] =
+{
+ 0x00000000, 0x3bd56bd3, 0x3c486344, 0x3c90da15, 0x3cbc2677, 0x3ce67704, 0x3d080183, 0x3d1c7728,
+ 0x3d30a8fb, 0x3d44a03c, 0x3d586400, 0x3d6bf9e7, 0x3d7f6679, 0x3d8956bd, 0x3d92e906, 0x3d9c6b70,
+ 0x3da5df22, 0x3daf451b, 0x3db89e3e, 0x3dc1eb50, 0x3dcb2d04, 0x3dd463f7, 0x3ddd90b9, 0x3de6b3ca,
+ 0x3defcda0, 0x3df8dea6, 0x3e00f3a0, 0x3e0573e3, 0x3e09f046, 0x3e0e68f0, 0x3e12de06, 0x3e174fa6,
+ 0x3e1bbdf2, 0x3e202906, 0x3e2490fd, 0x3e28f5f1, 0x3e2d57fb, 0x3e31b72f, 0x3e3613a4, 0x3e3a6d6e,
+ 0x3e3ec4a0, 0x3e43194d, 0x3e476b84, 0x3e4bbb57, 0x3e5008d7, 0x3e54540f, 0x3e589d0f, 0x3e5ce3e5,
+ 0x3e61289d, 0x3e656b44, 0x3e69abe5, 0x3e6dea8d, 0x3e722745, 0x3e766217, 0x3e7a9b0e, 0x3e7ed235,
+ 0x3e8183c9, 0x3e839d98, 0x3e85b68c, 0x3e87cea8, 0x3e89e5f2, 0x3e8bfc6b, 0x3e8e1219, 0x3e9026ff,
+ 0x3e923b20, 0x3e944e7f, 0x3e966120, 0x3e987307, 0x3e9a8436, 0x3e9c94af, 0x3e9ea476, 0x3ea0b38e,
+ 0x3ea2c1fb, 0x3ea4cfbb, 0x3ea6dcd5, 0x3ea8e94a, 0x3eaaf51c, 0x3ead004e, 0x3eaf0ae2, 0x3eb114d9,
+ 0x3eb31e37, 0x3eb526fe, 0x3eb72f2f, 0x3eb936cd, 0x3ebb3dd8, 0x3ebd4454, 0x3ebf4a43, 0x3ec14fa5,
+ 0x3ec3547e, 0x3ec558cd, 0x3ec75c95, 0x3ec95fd8, 0x3ecb6297, 0x3ecd64d4, 0x3ecf6690, 0x3ed167ce,
+ 0x3ed3688e, 0x3ed568d1, 0x3ed76899, 0x3ed967e9, 0x3edb66bf, 0x3edd651f, 0x3edf630a, 0x3ee16080,
+ 0x3ee35d84, 0x3ee55a16, 0x3ee75636, 0x3ee951e8, 0x3eeb4d2a, 0x3eed4800, 0x3eef4269, 0x3ef13c68,
+ 0x3ef335fc, 0x3ef52f26, 0x3ef727ea, 0x3ef92046, 0x3efb183c, 0x3efd0fcd, 0x3eff06fa, 0x3f007ee2,
+ 0x3f017a16, 0x3f027519, 0x3f036fec, 0x3f046a8f, 0x3f056502, 0x3f065f47, 0x3f07595d, 0x3f085344,
+ 0x3f094cfe, 0x3f0a468b, 0x3f0b3feb, 0x3f0c391e, 0x3f0d3224, 0x3f0e2aff, 0x3f0f23af, 0x3f101c32,
+ 0x3f11148c, 0x3f120cba, 0x3f1304bf, 0x3f13fc9a, 0x3f14f44b, 0x3f15ebd3, 0x3f16e333, 0x3f17da6b,
+ 0x3f18d17a, 0x3f19c860, 0x3f1abf1f, 0x3f1bb5b7, 0x3f1cac28, 0x3f1da272, 0x3f1e9895, 0x3f1f8e92,
+ 0x3f20846a, 0x3f217a1c, 0x3f226fa8, 0x3f23650f, 0x3f245a52, 0x3f254f70, 0x3f264469, 0x3f27393f,
+ 0x3f282df1, 0x3f29227f, 0x3f2a16ea, 0x3f2b0b31, 0x3f2bff56, 0x3f2cf358, 0x3f2de738, 0x3f2edaf6,
+ 0x3f2fce91, 0x3f30c20b, 0x3f31b564, 0x3f32a89b, 0x3f339bb1, 0x3f348ea6, 0x3f35817a, 0x3f36742f,
+ 0x3f3766c3, 0x3f385936, 0x3f394b8a, 0x3f3a3dbe, 0x3f3b2fd3, 0x3f3c21c8, 0x3f3d139e, 0x3f3e0556,
+ 0x3f3ef6ee, 0x3f3fe868, 0x3f40d9c4, 0x3f41cb01, 0x3f42bc20, 0x3f43ad22, 0x3f449e06, 0x3f458ecc,
+ 0x3f467f75, 0x3f477001, 0x3f486071, 0x3f4950c2, 0x3f4a40f8, 0x3f4b3111, 0x3f4c210d, 0x3f4d10ed,
+ 0x3f4e00b2, 0x3f4ef05a, 0x3f4fdfe7, 0x3f50cf58, 0x3f51beae, 0x3f52ade8, 0x3f539d07, 0x3f548c0c,
+ 0x3f557af5, 0x3f5669c4, 0x3f575878, 0x3f584711, 0x3f593590, 0x3f5a23f6, 0x3f5b1241, 0x3f5c0072,
+ 0x3f5cee89, 0x3f5ddc87, 0x3f5eca6b, 0x3f5fb835, 0x3f60a5e7, 0x3f619380, 0x3f6280ff, 0x3f636e65,
+ 0x3f645bb3, 0x3f6548e8, 0x3f663604, 0x3f672309, 0x3f680ff4, 0x3f68fcc8, 0x3f69e983, 0x3f6ad627,
+ 0x3f6bc2b3, 0x3f6caf27, 0x3f6d9b83, 0x3f6e87c8, 0x3f6f73f5, 0x3f70600c, 0x3f714c0b, 0x3f7237f4,
+ 0x3f7323c4, 0x3f740f7f, 0x3f74fb22, 0x3f75e6af, 0x3f76d225, 0x3f77bd85, 0x3f78a8ce, 0x3f799401,
+ 0x3f7a7f1e, 0x3f7b6a25, 0x3f7c5516, 0x3f7d3ff1, 0x3f7e2ab6, 0x3f7f1566, 0x3f800000, 0x3f800000
+};
+
+#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes")
+static inline XMVECTOR _TableEncodeGamma22( FXMVECTOR v )
+{
+ float f[4];
+ XMStoreFloat4( (XMFLOAT4*)f, v );
+
+ for( size_t i=0; i < 4; ++i )
+ {
+ float f2 = sqrtf(f[i]) * 254.0f;
+
+ uint32_t i2 = static_cast<uint32_t>(f2);
+ i2 = std::min<uint32_t>( i2, _countof( g_fEncodeGamma22 )-2 );
+
+ float fS = f2 - (float) i2;
+ float fA = ((float *) g_fEncodeGamma22)[i2];
+ float fB = ((float *) g_fEncodeGamma22)[i2 + 1];
+
+ f[i] = fA + fS * (fB - fA);
+ }
+
+ return XMLoadFloat4( (XMFLOAT4*)f );
+}
+
+
+//-------------------------------------------------------------------------------------
+// sRGB -> RGB
+//-------------------------------------------------------------------------------------
+static const uint32_t g_fDecodeGamma22[] =
+{
+ 0x00000000, 0x3b144eb0, 0x3b9ef3b0, 0x3bf84b42, 0x3c2a5c46, 0x3c59c180, 0x3c850eb5, 0x3c9da52a,
+ 0x3cb6967a, 0x3ccfd852, 0x3ce9628b, 0x3d01974b, 0x3d0e9b82, 0x3d1bbba3, 0x3d28f5bc, 0x3d364822,
+ 0x3d43b159, 0x3d51301d, 0x3d5ec344, 0x3d6c69c9, 0x3d7a22c4, 0x3d83f6ad, 0x3d8ae465, 0x3d91da35,
+ 0x3d98d7c7, 0x3d9fdcd2, 0x3da6e914, 0x3dadfc47, 0x3db51635, 0x3dbc36a3, 0x3dc35d62, 0x3dca8a3a,
+ 0x3dd1bd02, 0x3dd8f591, 0x3de033bb, 0x3de7775d, 0x3deec050, 0x3df60e74, 0x3dfd61a6, 0x3e025ce5,
+ 0x3e060b61, 0x3e09bc38, 0x3e0d6f5f, 0x3e1124c8, 0x3e14dc68, 0x3e189630, 0x3e1c521a, 0x3e201016,
+ 0x3e23d01d, 0x3e279225, 0x3e2b5624, 0x3e2f1c10, 0x3e32e3e4, 0x3e36ad94, 0x3e3a7918, 0x3e3e4668,
+ 0x3e42157f, 0x3e45e654, 0x3e49b8e0, 0x3e4d8d1d, 0x3e516304, 0x3e553a8d, 0x3e5913b4, 0x3e5cee70,
+ 0x3e60cabf, 0x3e64a89b, 0x3e6887fb, 0x3e6c68db, 0x3e704b3a, 0x3e742f0e, 0x3e781454, 0x3e7bfb04,
+ 0x3e7fe321, 0x3e81e650, 0x3e83dbc0, 0x3e85d1dc, 0x3e87c8a3, 0x3e89c015, 0x3e8bb830, 0x3e8db0ee,
+ 0x3e8faa51, 0x3e91a454, 0x3e939ef9, 0x3e959a3b, 0x3e97961b, 0x3e999295, 0x3e9b8fa7, 0x3e9d8d52,
+ 0x3e9f8b93, 0x3ea18a6a, 0x3ea389d2, 0x3ea589cb, 0x3ea78a56, 0x3ea98b6e, 0x3eab8d15, 0x3ead8f47,
+ 0x3eaf9204, 0x3eb1954a, 0x3eb39917, 0x3eb59d6c, 0x3eb7a246, 0x3eb9a7a5, 0x3ebbad88, 0x3ebdb3ec,
+ 0x3ebfbad3, 0x3ec1c237, 0x3ec3ca1a, 0x3ec5d27c, 0x3ec7db58, 0x3ec9e4b4, 0x3ecbee85, 0x3ecdf8d3,
+ 0x3ed0039a, 0x3ed20ed8, 0x3ed41a8a, 0x3ed626b5, 0x3ed83351, 0x3eda4065, 0x3edc4de9, 0x3ede5be0,
+ 0x3ee06a4a, 0x3ee27923, 0x3ee4886a, 0x3ee69821, 0x3ee8a845, 0x3eeab8d8, 0x3eecc9d6, 0x3eeedb3f,
+ 0x3ef0ed13, 0x3ef2ff53, 0x3ef511fb, 0x3ef7250a, 0x3ef93883, 0x3efb4c61, 0x3efd60a7, 0x3eff7553,
+ 0x3f00c531, 0x3f01cfeb, 0x3f02dad9, 0x3f03e5f5, 0x3f04f145, 0x3f05fcc4, 0x3f070875, 0x3f081456,
+ 0x3f092067, 0x3f0a2ca8, 0x3f0b3917, 0x3f0c45b7, 0x3f0d5284, 0x3f0e5f7f, 0x3f0f6caa, 0x3f107a03,
+ 0x3f118789, 0x3f12953b, 0x3f13a31d, 0x3f14b12b, 0x3f15bf64, 0x3f16cdca, 0x3f17dc5e, 0x3f18eb1b,
+ 0x3f19fa05, 0x3f1b091b, 0x3f1c185c, 0x3f1d27c7, 0x3f1e375c, 0x3f1f471d, 0x3f205707, 0x3f21671b,
+ 0x3f227759, 0x3f2387c2, 0x3f249852, 0x3f25a90c, 0x3f26b9ef, 0x3f27cafb, 0x3f28dc30, 0x3f29ed8b,
+ 0x3f2aff11, 0x3f2c10bd, 0x3f2d2290, 0x3f2e348b, 0x3f2f46ad, 0x3f3058f7, 0x3f316b66, 0x3f327dfd,
+ 0x3f3390ba, 0x3f34a39d, 0x3f35b6a7, 0x3f36c9d6, 0x3f37dd2b, 0x3f38f0a5, 0x3f3a0443, 0x3f3b1808,
+ 0x3f3c2bf2, 0x3f3d4000, 0x3f3e5434, 0x3f3f688c, 0x3f407d07, 0x3f4191a8, 0x3f42a66c, 0x3f43bb54,
+ 0x3f44d05f, 0x3f45e58e, 0x3f46fadf, 0x3f481054, 0x3f4925ed, 0x3f4a3ba8, 0x3f4b5186, 0x3f4c6789,
+ 0x3f4d7daa, 0x3f4e93f0, 0x3f4faa57, 0x3f50c0e0, 0x3f51d78b, 0x3f52ee58, 0x3f540545, 0x3f551c55,
+ 0x3f563386, 0x3f574ad7, 0x3f58624b, 0x3f5979de, 0x3f5a9191, 0x3f5ba965, 0x3f5cc15b, 0x3f5dd971,
+ 0x3f5ef1a6, 0x3f6009fc, 0x3f612272, 0x3f623b08, 0x3f6353bc, 0x3f646c90, 0x3f658586, 0x3f669e98,
+ 0x3f67b7cb, 0x3f68d11b, 0x3f69ea8d, 0x3f6b041b, 0x3f6c1dc9, 0x3f6d3795, 0x3f6e5180, 0x3f6f6b8b,
+ 0x3f7085b2, 0x3f719ff7, 0x3f72ba5b, 0x3f73d4dc, 0x3f74ef7c, 0x3f760a38, 0x3f772512, 0x3f78400b,
+ 0x3f795b20, 0x3f7a7651, 0x3f7b91a2, 0x3f7cad0e, 0x3f7dc896, 0x3f7ee43c, 0x3f800000, 0x3f800000
+};
+
+
+#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes")
+static inline XMVECTOR _TableDecodeGamma22( FXMVECTOR v )
+{
+ float f[4];
+ XMStoreFloat4( (XMFLOAT4*)f, v );
+
+ for( size_t i=0; i < 4; ++i )
+ {
+ float f2 = f[i] * f[i] * 254.0f;
+ uint32_t i2 = static_cast<uint32_t>(f2);
+ i2 = std::min<uint32_t>( i2, _countof(g_fDecodeGamma22)-2 );
+
+ float fS = f2 - (float) i2;
+ float fA = ((float *) g_fDecodeGamma22)[i2];
+ float fB = ((float *) g_fDecodeGamma22)[i2 + 1];
+
+ f[i] = fA + fS * (fB - fA);
+ }
+
+ return XMLoadFloat4( (XMFLOAT4*)f );
+}
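+
+// Note on the two lookup tables above: both store IEEE-754 float bit patterns and are sampled
+// with linear interpolation. The encode table is indexed by sqrt(x)*254, so entry i is
+// approximately (i/254)^(2/2.2) and the curve is sampled more densely near zero where the
+// gamma 2.2 transfer changes fastest; the decode table is indexed by x*x*254, so entry i is
+// approximately (i/254)^(2.2/2). For example, entry 127 of the encode table is ~0.533
+// (0.5^(1/1.1)) and entry 127 of the decode table is ~0.467 (0.5^1.1).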
+
+
+//-------------------------------------------------------------------------------------
+// Convert scanline based on source/target formats
+//-------------------------------------------------------------------------------------
+struct ConvertData
+{
+ DXGI_FORMAT format;
+ size_t datasize;
+ DWORD flags;
+};
+
+static const ConvertData g_ConvertTable[] = {
+ { DXGI_FORMAT_R32G32B32A32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R32G32B32A32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R32G32B32A32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R32G32B32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R32G32B32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R32G32B32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R16G16B16A16_FLOAT, 16, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R16G16B16A16_UNORM, 16, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R16G16B16A16_UINT, 16, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R16G16B16A16_SNORM, 16, CONVF_SNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R16G16B16A16_SINT, 16, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R32G32_FLOAT, 32, CONVF_FLOAT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R32G32_UINT, 32, CONVF_UINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R32G32_SINT, 32, CONVF_SINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_D32_FLOAT_S8X24_UINT, 32, CONVF_FLOAT | CONVF_DEPTH | CONVF_STENCIL },
+ { DXGI_FORMAT_R10G10B10A2_UNORM, 10, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R10G10B10A2_UINT, 10, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R11G11B10_FLOAT, 10, CONVF_FLOAT | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R8G8B8A8_UNORM, 8, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R8G8B8A8_UINT, 8, CONVF_UINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R8G8B8A8_SNORM, 8, CONVF_SNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R8G8B8A8_SINT, 8, CONVF_SINT | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_R16G16_FLOAT, 16, CONVF_FLOAT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R16G16_UNORM, 16, CONVF_UNORM | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R16G16_UINT, 16, CONVF_UINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R16G16_SNORM, 16, CONVF_SNORM | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R16G16_SINT, 16, CONVF_SINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_D32_FLOAT, 32, CONVF_FLOAT | CONVF_DEPTH },
+ { DXGI_FORMAT_R32_FLOAT, 32, CONVF_FLOAT | CONVF_R },
+ { DXGI_FORMAT_R32_UINT, 32, CONVF_UINT | CONVF_R },
+ { DXGI_FORMAT_R32_SINT, 32, CONVF_SINT | CONVF_R },
+ { DXGI_FORMAT_D24_UNORM_S8_UINT, 32, CONVF_UNORM | CONVF_DEPTH | CONVF_STENCIL },
+ { DXGI_FORMAT_R8G8_UNORM, 8, CONVF_UNORM | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R8G8_UINT, 8, CONVF_UINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R8G8_SNORM, 8, CONVF_SNORM | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R8G8_SINT, 8, CONVF_SINT | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_R16_FLOAT, 16, CONVF_FLOAT | CONVF_R },
+ { DXGI_FORMAT_D16_UNORM, 16, CONVF_UNORM | CONVF_DEPTH },
+ { DXGI_FORMAT_R16_UNORM, 16, CONVF_UNORM | CONVF_R },
+ { DXGI_FORMAT_R16_UINT, 16, CONVF_UINT | CONVF_R },
+ { DXGI_FORMAT_R16_SNORM, 16, CONVF_SNORM | CONVF_R },
+ { DXGI_FORMAT_R16_SINT, 16, CONVF_SINT | CONVF_R },
+ { DXGI_FORMAT_R8_UNORM, 8, CONVF_UNORM | CONVF_R },
+ { DXGI_FORMAT_R8_UINT, 8, CONVF_UINT | CONVF_R },
+ { DXGI_FORMAT_R8_SNORM, 8, CONVF_SNORM | CONVF_R },
+ { DXGI_FORMAT_R8_SINT, 8, CONVF_SINT | CONVF_R },
+ { DXGI_FORMAT_A8_UNORM, 8, CONVF_UNORM | CONVF_A },
+ { DXGI_FORMAT_R1_UNORM, 1, CONVF_UNORM | CONVF_R },
+ { DXGI_FORMAT_R9G9B9E5_SHAREDEXP, 9, CONVF_SHAREDEXP | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R8G8_B8G8_UNORM, 8, CONVF_UNORM | CONVF_PACKED | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_G8R8_G8B8_UNORM, 8, CONVF_UNORM | CONVF_PACKED | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_BC1_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC1_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC2_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC2_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC3_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC3_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC4_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R },
+ { DXGI_FORMAT_BC4_SNORM, 8, CONVF_SNORM | CONVF_BC | CONVF_R },
+ { DXGI_FORMAT_BC5_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_BC5_SNORM, 8, CONVF_SNORM | CONVF_BC | CONVF_R | CONVF_G },
+ { DXGI_FORMAT_B5G6R5_UNORM, 5, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_B5G5R5A1_UNORM, 5, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_B8G8R8A8_UNORM, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_B8G8R8X8_UNORM, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, 10, CONVF_UNORM | CONVF_X2 | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_B8G8R8X8_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B },
+ { DXGI_FORMAT_BC6H_UF16, 16, CONVF_FLOAT | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC6H_SF16, 16, CONVF_FLOAT | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC7_UNORM, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+ { DXGI_FORMAT_BC7_UNORM_SRGB, 8, CONVF_UNORM | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+#ifdef DXGI_1_2_FORMATS
+ { DXGI_FORMAT_B4G4R4A4_UNORM, 4, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+#endif
+};
+
+#pragma prefast( suppress : 25004, "Signature must match bsearch" );
+static int __cdecl _ConvertCompare( const void* ptr1, const void *ptr2 )
+{
+ const ConvertData *p1 = reinterpret_cast<const ConvertData*>(ptr1);
+ const ConvertData *p2 = reinterpret_cast<const ConvertData*>(ptr2);
+ if ( p1->format == p2->format ) return 0;
+ else return (p1->format < p2->format ) ? -1 : 1;
+}
+
+DWORD _GetConvertFlags( DXGI_FORMAT format )
+{
+#ifdef _DEBUG
+ // Ensure conversion table is in ascending order
+ assert( _countof(g_ConvertTable) > 0 );
+ DXGI_FORMAT lastvalue = g_ConvertTable[0].format;
+ for( size_t index=1; index < _countof(g_ConvertTable); ++index )
+ {
+ assert( g_ConvertTable[index].format > lastvalue );
+ lastvalue = g_ConvertTable[index].format;
+ }
+#endif
+
+ ConvertData key = { format, 0 };
+ const ConvertData* in = (const ConvertData*) bsearch( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData),
+ _ConvertCompare );
+ return (in) ? in->flags : 0;
+}
+
+void _ConvertScanline( XMVECTOR* pBuffer, size_t count, DXGI_FORMAT outFormat, DXGI_FORMAT inFormat, DWORD flags )
+{
+ assert( pBuffer && count > 0 && (((uintptr_t)pBuffer & 0xF) == 0) );
+ assert( IsValid(outFormat) && !IsVideo(outFormat) && !IsTypeless(outFormat) );
+ assert( IsValid(inFormat) && !IsVideo(inFormat) && !IsTypeless(inFormat) );
+
+ if ( !pBuffer )
+ return;
+
+#ifdef _DEBUG
+ // Ensure conversion table is in ascending order
+ assert( _countof(g_ConvertTable) > 0 );
+ DXGI_FORMAT lastvalue = g_ConvertTable[0].format;
+ for( size_t index=1; index < _countof(g_ConvertTable); ++index )
+ {
+ assert( g_ConvertTable[index].format > lastvalue );
+ lastvalue = g_ConvertTable[index].format;
+ }
+#endif
+
+ // Determine conversion details about source and dest formats
+ ConvertData key = { inFormat, 0 };
+ const ConvertData* in = (const ConvertData*) bsearch( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData),
+ _ConvertCompare );
+ key.format = outFormat;
+ const ConvertData* out = (const ConvertData*) bsearch( &key, g_ConvertTable, _countof(g_ConvertTable), sizeof(ConvertData),
+ _ConvertCompare );
+ if ( !in || !out )
+ {
+ assert(false);
+ return;
+ }
+
+ assert( _GetConvertFlags( inFormat ) == in->flags );
+ assert( _GetConvertFlags( outFormat ) == out->flags );
+
+ // Handle SRGB filtering modes
+ if ( IsSRGB( inFormat ) )
+ flags |= TEX_FILTER_SRGB_IN;
+
+ if ( IsSRGB( outFormat ) )
+ flags |= TEX_FILTER_SRGB_OUT;
+
+ if ( in->flags & CONVF_SNORM )
+ flags &= ~TEX_FILTER_SRGB_IN;
+
+ if ( out->flags & CONVF_SNORM )
+ flags &= ~TEX_FILTER_SRGB_OUT;
+
+ if ( (flags & (TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT)) == (TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT) )
+ {
+ flags &= ~(TEX_FILTER_SRGB_IN|TEX_FILTER_SRGB_OUT);
+ }
+
+ // sRGB input processing (sRGB -> RGB)
+ if ( flags & TEX_FILTER_SRGB_IN )
+ {
+ if ( (in->flags & CONVF_FLOAT) || (in->flags & CONVF_UNORM) )
+ {
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ // rgb = rgb^(2.2); a=a
+ XMVECTOR v = *ptr;
+ XMVECTOR v1 = _TableDecodeGamma22( v );
+ *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 );
+ }
+ }
+ }
+
+ // Handle conversion special cases
+ DWORD diffFlags = in->flags ^ out->flags;
+ if ( diffFlags != 0)
+ {
+ if ( out->flags & CONVF_UNORM )
+ {
+ if ( in->flags & CONVF_SNORM )
+ {
+ // SNORM -> UNORM
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorMultiplyAdd( v, g_XMOneHalf, g_XMOneHalf );
+ }
+ }
+ else if ( in->flags & CONVF_FLOAT )
+ {
+ // FLOAT -> UNORM
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorSaturate( v );
+ }
+ }
+ }
+ else if ( out->flags & CONVF_SNORM )
+ {
+ if ( in->flags & CONVF_UNORM )
+ {
+ // UNORM -> SNORM
+ static XMVECTORF32 two = { 2.0f, 2.0f, 2.0f, 2.0f };
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorMultiplyAdd( v, two, g_XMNegativeOne );
+ }
+ }
+ else if ( in->flags & CONVF_FLOAT )
+ {
+ // FLOAT -> SNORM
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorClamp( v, g_XMNegativeOne, g_XMOne );
+ }
+ }
+ }
+
+ // !CONVF_A -> CONVF_A is handled because LoadScanline ensures alpha defaults to 1.0 for no-alpha formats
+
+ // CONVF_PACKED cases are handled because LoadScanline/StoreScanline handles packing/unpacking
+
+ if ( ((out->flags & CONVF_RGBA_MASK) == CONVF_A) && !(in->flags & CONVF_A) )
+ {
+ // !CONVF_A -> A format
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorSplatX( v );
+ }
+ }
+ else if ( ((in->flags & CONVF_RGBA_MASK) == CONVF_A) && !(out->flags & CONVF_A) )
+ {
+ // A format -> !CONVF_A
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ *ptr++ = XMVectorSplatW( v );
+ }
+ }
+ else if ( ((in->flags & CONVF_RGB_MASK) == CONVF_R) && ((out->flags & CONVF_RGB_MASK) == (CONVF_R|CONVF_G|CONVF_B)) )
+ {
+ // R format -> RGB format
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ XMVECTOR v = *ptr;
+ XMVECTOR v1 = XMVectorSplatX( v );
+ *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 );
+ }
+ }
+ }
+
+ // sRGB output processing (RGB -> sRGB)
+ if ( flags & TEX_FILTER_SRGB_OUT )
+ {
+ if ( (out->flags & CONVF_FLOAT) || (out->flags & CONVF_UNORM) )
+ {
+ XMVECTOR* ptr = pBuffer;
+ for( size_t i=0; i < count; ++i )
+ {
+ // rgb = rgb^(1/2.2); a=a
+ XMVECTOR v = *ptr;
+ XMVECTOR v1 = _TableEncodeGamma22( v );
+ *ptr++ = XMVectorSelect( v, v1, g_XMSelect1110 );
+ }
+ }
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// Convert the source image using WIC
+//-------------------------------------------------------------------------------------
+static HRESULT _ConvertUsingWIC( _In_ const Image& srcImage, _In_ const WICPixelFormatGUID& pfGUID,
+ _In_ const WICPixelFormatGUID& targetGUID,
+ _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage )
+{
+ assert( srcImage.width == destImage.width );
+ assert( srcImage.height == destImage.height );
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICFormatConverter> FC;
+ HRESULT hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Need to implement usage of TEX_FILTER_SRGB_IN/TEX_FILTER_SRGB_OUT
+
+ BOOL canConvert = FALSE;
+ hr = FC->CanConvert( pfGUID, targetGUID, &canConvert );
+ if ( FAILED(hr) || !canConvert )
+ {
+ // This case is not an issue for the subset of WIC formats that map directly to DXGI
+ return E_UNEXPECTED;
+ }
+
+ ScopedObject<IWICBitmap> source;
+ hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( srcImage.width ), static_cast<UINT>( srcImage.height ), pfGUID,
+ static_cast<UINT>( srcImage.rowPitch ), static_cast<UINT>( srcImage.slicePitch ),
+ srcImage.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( source.Get(), targetGUID, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( destImage.rowPitch ), static_cast<UINT>( destImage.slicePitch ), destImage.pixels );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Convert the source using WIC and then convert to DXGI format from there
+//-------------------------------------------------------------------------------------
+static HRESULT _ConvertFromWIC( _In_ const Image& srcImage, _In_ const WICPixelFormatGUID& pfGUID,
+ _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage )
+{
+ assert( srcImage.width == destImage.width );
+ assert( srcImage.height == destImage.height );
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICFormatConverter> FC;
+ HRESULT hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ BOOL canConvert = FALSE;
+ hr = FC->CanConvert( pfGUID, GUID_WICPixelFormat128bppRGBAFloat, &canConvert );
+ if ( FAILED(hr) || !canConvert )
+ {
+ // This case is not an issue for the subset of WIC formats that map directly to DXGI
+ return E_UNEXPECTED;
+ }
+
+ ScratchImage temp;
+ hr = temp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *timg = temp.GetImage( 0, 0, 0 );
+ if ( !timg )
+ return E_POINTER;
+
+ ScopedObject<IWICBitmap> source;
+ hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( srcImage.width ), static_cast<UINT>( srcImage.height ), pfGUID,
+ static_cast<UINT>( srcImage.rowPitch ), static_cast<UINT>( srcImage.slicePitch ),
+ srcImage.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( source.Get(), GUID_WICPixelFormat128bppRGBAFloat, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( timg->rowPitch ), static_cast<UINT>( timg->slicePitch ), timg->pixels );
+ if ( FAILED(hr) )
+ return hr;
+
+    // Convert the temp image, which is now in R32G32B32A32_FLOAT format, into the final destination image
+ uint8_t *pSrc = timg->pixels;
+ uint8_t *pDest = destImage.pixels;
+ if ( !pSrc || !pDest )
+ return E_POINTER;
+
+ for( size_t h = 0; h < srcImage.height; ++h )
+ {
+ _ConvertScanline( reinterpret_cast<XMVECTOR*>(pSrc), srcImage.width, destImage.format, DXGI_FORMAT_R32G32B32A32_FLOAT, filter );
+
+ if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, reinterpret_cast<const XMVECTOR*>(pSrc), srcImage.width ) )
+ return E_FAIL;
+
+ pSrc += timg->rowPitch;
+ pDest += destImage.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Convert the source from DXGI format then use WIC to convert to final format
+//-------------------------------------------------------------------------------------
+static HRESULT _ConvertToWIC( _In_ const Image& srcImage,
+ _In_ const WICPixelFormatGUID& targetGUID, _In_ DWORD filter, _In_ float threshold, _In_ const Image& destImage )
+{
+ assert( srcImage.width == destImage.width );
+ assert( srcImage.height == destImage.height );
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICFormatConverter> FC;
+ HRESULT hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ BOOL canConvert = FALSE;
+ hr = FC->CanConvert( GUID_WICPixelFormat128bppRGBAFloat, targetGUID, &canConvert );
+ if ( FAILED(hr) || !canConvert )
+ {
+ // This case is not an issue for the subset of WIC formats that map directly to DXGI
+ return E_UNEXPECTED;
+ }
+
+ ScratchImage temp;
+ hr = temp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *timg = temp.GetImage( 0, 0, 0 );
+ if ( !timg )
+ return E_POINTER;
+
+ const uint8_t *pSrc = srcImage.pixels;
+ if ( !pSrc )
+ return E_POINTER;
+
+ uint8_t *pDest = timg->pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ for( size_t h = 0; h < srcImage.height; ++h )
+ {
+ if ( !_LoadScanline( reinterpret_cast<XMVECTOR*>(pDest), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) )
+ return E_FAIL;
+
+ _ConvertScanline( reinterpret_cast<XMVECTOR*>(pDest), srcImage.width, DXGI_FORMAT_R32G32B32A32_FLOAT, srcImage.format, filter );
+
+ pSrc += srcImage.rowPitch;
+ pDest += timg->rowPitch;
+ }
+
+ // Perform conversion on temp image which is now in R32G32B32A32_FLOAT format
+ ScopedObject<IWICBitmap> source;
+ hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( timg->width ), static_cast<UINT>( timg->height ), GUID_WICPixelFormat128bppRGBAFloat,
+ static_cast<UINT>( timg->rowPitch ), static_cast<UINT>( timg->slicePitch ),
+ timg->pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( source.Get(), targetGUID, _GetWICDither( filter ), 0, threshold, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( destImage.rowPitch ), static_cast<UINT>( destImage.slicePitch ), destImage.pixels );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Convert the source image (not using WIC)
+//-------------------------------------------------------------------------------------
+static HRESULT _Convert( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage )
+{
+ assert( srcImage.width == destImage.width );
+ assert( srcImage.height == destImage.height );
+
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*srcImage.width), 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ const uint8_t *pSrc = srcImage.pixels;
+ uint8_t *pDest = destImage.pixels;
+ if ( !pSrc || !pDest )
+ return E_POINTER;
+
+ for( size_t h = 0; h < srcImage.height; ++h )
+ {
+ if ( !_LoadScanline( scanline.get(), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) )
+ return E_FAIL;
+
+ _ConvertScanline( scanline.get(), srcImage.width, destImage.format, srcImage.format, filter );
+
+ if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, scanline.get(), srcImage.width ) )
+ return E_FAIL;
+
+ pSrc += srcImage.rowPitch;
+ pDest += destImage.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Convert image
+//-------------------------------------------------------------------------------------
+HRESULT Convert( const Image& srcImage, DXGI_FORMAT format, DWORD filter, float threshold, ScratchImage& image )
+{
+ if ( (srcImage.format == format) || !IsValid( format ) )
+ return E_INVALIDARG;
+
+ if ( !srcImage.pixels )
+ return E_POINTER;
+
+ if ( IsCompressed(srcImage.format) || IsCompressed(format)
+ || IsVideo(srcImage.format) || IsVideo(format)
+ || IsTypeless(srcImage.format) || IsTypeless(format) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+#ifdef _AMD64_
+ if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ HRESULT hr = image.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *rimage = image.GetImage( 0, 0, 0 );
+ if ( !rimage )
+ {
+ image.Release();
+ return E_POINTER;
+ }
+
+ WICPixelFormatGUID pfGUID;
+ if ( _DXGIToWIC( srcImage.format, pfGUID ) )
+ {
+ WICPixelFormatGUID targetGUID;
+ if ( _DXGIToWIC( format, targetGUID ) )
+ {
+ // Case 1: Both source and target formats are WIC supported
+ hr = _ConvertUsingWIC( srcImage, pfGUID, targetGUID, filter, threshold, *rimage );
+ }
+ else
+ {
+ // Case 2: Source format is supported by WIC, but not the target format
+ hr = _ConvertFromWIC( srcImage, pfGUID, filter, threshold, *rimage );
+ }
+ }
+ else
+ {
+ WICPixelFormatGUID targetGUID;
+ if ( _DXGIToWIC( format, targetGUID ) )
+ {
+            // Case 3: Source format is not supported by WIC, but the target format is
+ hr = _ConvertToWIC( srcImage, targetGUID, filter, threshold, *rimage );
+ }
+ else
+ {
+            // Case 4: Neither the source nor the target format is supported by WIC
+ hr = _Convert( srcImage, filter, *rimage );
+ }
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
+
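+// Illustrative usage sketch (editorial note, not part of the library): a typical
+// call converts a single loaded Image into a new ScratchImage. TEX_FILTER_DEFAULT
+// and the 0.5f threshold are assumed to be the usual defaults declared in
+// DirectXTex.h; 'loaded' is assumed to be a ScratchImage populated elsewhere.
+//
+//     ScratchImage converted;
+//     HRESULT hr = Convert( *loaded.GetImage( 0, 0, 0 ), DXGI_FORMAT_R8G8B8A8_UNORM,
+//                           TEX_FILTER_DEFAULT, 0.5f, converted );
+//     if ( SUCCEEDED(hr) )
+//     {
+//         const Image* img = converted.GetImage( 0, 0, 0 );
+//         // img->pixels now holds the converted scanlines
+//     }
+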
+
+//-------------------------------------------------------------------------------------
+// Convert image (complex)
+//-------------------------------------------------------------------------------------
+HRESULT Convert( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DXGI_FORMAT format, DWORD filter, float threshold, ScratchImage& result )
+{
+ if ( !srcImages || !nimages || (metadata.format == format) || !IsValid(format) )
+ return E_INVALIDARG;
+
+ if ( IsCompressed(metadata.format) || IsCompressed(format)
+ || IsVideo(metadata.format) || IsVideo(format)
+ || IsTypeless(metadata.format) || IsTypeless(format) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+#ifdef _AMD64_
+ if ( (metadata.width > 0xFFFFFFFF) || (metadata.height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ TexMetadata mdata2 = metadata;
+ mdata2.format = format;
+ HRESULT hr = result.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != result.GetImageCount() )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = result.GetImages();
+ if ( !dest )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ WICPixelFormatGUID pfGUID, targetGUID;
+ bool wicpf = _DXGIToWIC( metadata.format, pfGUID );
+ bool wictargetpf = _DXGIToWIC( format, targetGUID );
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ const Image& src = srcImages[ index ];
+ if ( src.format != metadata.format )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+#ifdef _AMD64_
+ if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) )
+ return E_FAIL;
+#endif
+
+ const Image& dst = dest[ index ];
+ assert( dst.format == format );
+
+ if ( src.width != dst.width || src.height != dst.height )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ if ( wicpf )
+ {
+ if ( wictargetpf )
+ {
+ // Case 1: Both source and target formats are WIC supported
+ hr = _ConvertUsingWIC( src, pfGUID, targetGUID, filter, threshold, dst );
+ }
+ else
+ {
+ // Case 2: Source format is supported by WIC, but not the target format
+ hr = _ConvertFromWIC( src, pfGUID, filter, threshold, dst );
+ }
+ }
+ else
+ {
+ if ( wictargetpf )
+ {
+                // Case 3: Source format is not supported by WIC, but the target format is
+ hr = _ConvertToWIC( src, targetGUID, filter, threshold, dst );
+ }
+ else
+ {
+                // Case 4: Neither the source nor the target format is supported by WIC
+ hr = _Convert( src, filter, dst );
+ }
+ }
+
+ if ( FAILED(hr) )
+ {
+ result.Release();
+ return hr;
+ }
+ }
+
+ return S_OK;
+}
+
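+// Illustrative usage sketch (editorial note, not part of the library): the complex
+// overload converts every image of a ScratchImage (e.g. a full mip chain) in one
+// call. 'source' is assumed to be a populated ScratchImage; GetMetadata() is assumed
+// to be the accessor declared in DirectXTex.h.
+//
+//     ScratchImage converted;
+//     HRESULT hr = Convert( source.GetImages(), source.GetImageCount(), source.GetMetadata(),
+//                           DXGI_FORMAT_R16G16B16A16_FLOAT, TEX_FILTER_DEFAULT, 0.5f, converted );
+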
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp
new file mode 100644
index 00000000..e640dfd0
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexD3D11.cpp
@@ -0,0 +1,820 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexD3D11.cpp
+//
+// DirectX Texture Library - Direct3D 11 helpers
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#include <d3d10.h>
+
+namespace DirectX
+{
+
+static HRESULT _Capture( _In_ ID3D11DeviceContext* pContext, _In_ ID3D11Resource* pSource, _In_ const TexMetadata& metadata,
+ _In_ const ScratchImage& result )
+{
+ if ( !pContext || !pSource || !result.GetPixels() )
+ return E_POINTER;
+
+ if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D )
+ {
+ //--- Volume texture ----------------------------------------------------------
+ assert( metadata.arraySize == 1 );
+
+ size_t height = metadata.height;
+ size_t depth = metadata.depth;
+
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ UINT dindex = D3D11CalcSubresource( static_cast<UINT>( level ), 0, static_cast<UINT>( metadata.mipLevels ) );
+
+ D3D11_MAPPED_SUBRESOURCE mapped;
+ HRESULT hr = pContext->Map( pSource, dindex, D3D11_MAP_READ, 0, &mapped );
+ if ( FAILED(hr) )
+ return hr;
+
+ const uint8_t* pslice = reinterpret_cast<const uint8_t*>( mapped.pData );
+ if ( !pslice )
+ {
+ pContext->Unmap( pSource, dindex );
+ return E_POINTER;
+ }
+
+ size_t lines = ComputeScanlines( metadata.format, height );
+
+ for( size_t slice = 0; slice < depth; ++slice )
+ {
+ const Image* img = result.GetImage( level, 0, slice );
+ if ( !img )
+ {
+ pContext->Unmap( pSource, dindex );
+ return E_FAIL;
+ }
+
+ if ( !img->pixels )
+ {
+ pContext->Unmap( pSource, dindex );
+ return E_POINTER;
+ }
+
+ const uint8_t* sptr = pslice;
+ uint8_t* dptr = img->pixels;
+ for( size_t h = 0; h < lines; ++h )
+ {
+ size_t msize = std::min<size_t>( img->rowPitch, mapped.RowPitch );
+ memcpy_s( dptr, img->rowPitch, sptr, msize );
+ sptr += mapped.RowPitch;
+ dptr += img->rowPitch;
+ }
+
+ pslice += mapped.DepthPitch;
+ }
+
+ pContext->Unmap( pSource, dindex );
+
+ if ( height > 1 )
+ height >>= 1;
+ if ( depth > 1 )
+ depth >>= 1;
+ }
+ }
+ else
+ {
+ //--- 1D or 2D texture --------------------------------------------------------
+ assert( metadata.depth == 1 );
+
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t height = metadata.height;
+
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ UINT dindex = D3D11CalcSubresource( static_cast<UINT>( level ), static_cast<UINT>( item ), static_cast<UINT>( metadata.mipLevels ) );
+
+ D3D11_MAPPED_SUBRESOURCE mapped;
+ HRESULT hr = pContext->Map( pSource, dindex, D3D11_MAP_READ, 0, &mapped );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image* img = result.GetImage( level, item, 0 );
+ if ( !img )
+ {
+ pContext->Unmap( pSource, dindex );
+ return E_FAIL;
+ }
+
+ if ( !img->pixels )
+ {
+ pContext->Unmap( pSource, dindex );
+ return E_POINTER;
+ }
+
+ size_t lines = ComputeScanlines( metadata.format, height );
+
+ const uint8_t* sptr = reinterpret_cast<const uint8_t*>( mapped.pData );
+ uint8_t* dptr = img->pixels;
+ for( size_t h = 0; h < lines; ++h )
+ {
+ size_t msize = std::min<size_t>( img->rowPitch, mapped.RowPitch );
+ memcpy_s( dptr, img->rowPitch, sptr, msize );
+ sptr += mapped.RowPitch;
+ dptr += img->rowPitch;
+ }
+
+ pContext->Unmap( pSource, dindex );
+
+ if ( height > 1 )
+ height >>= 1;
+ }
+ }
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Determine if given texture metadata is supported on the given device
+//-------------------------------------------------------------------------------------
+bool IsSupportedTexture( ID3D11Device* pDevice, const TexMetadata& metadata )
+{
+ if ( !pDevice )
+ return false;
+
+ D3D_FEATURE_LEVEL fl = pDevice->GetFeatureLevel();
+
+ // Validate format
+ DXGI_FORMAT fmt = metadata.format;
+
+ if ( !IsValid( fmt ) )
+ return false;
+
+ if ( IsVideo(fmt) )
+ return false;
+
+ switch( fmt )
+ {
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC4_UNORM:
+ case DXGI_FORMAT_BC4_SNORM:
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_BC5_UNORM:
+ case DXGI_FORMAT_BC5_SNORM:
+ if ( fl < D3D_FEATURE_LEVEL_10_0 )
+ return false;
+ break;
+
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ if ( fl < D3D_FEATURE_LEVEL_11_0 )
+ return false;
+ break;
+ }
+
+ // Validate miplevel count
+ if ( metadata.mipLevels > D3D11_REQ_MIP_LEVELS )
+ return false;
+
+ // Validate array size, dimension, and width/height
+ size_t arraySize = metadata.arraySize;
+ size_t iWidth = metadata.width;
+ size_t iHeight = metadata.height;
+ size_t iDepth = metadata.depth;
+
+    // Most cases are known a priori based on feature level, but we use this for robustness to handle the few optional cases
+ UINT formatSupport = 0;
+ pDevice->CheckFormatSupport( fmt, &formatSupport );
+
+ switch ( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE1D) )
+ return false;
+
+ if ( (arraySize > D3D11_REQ_TEXTURE1D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D11_REQ_TEXTURE1D_U_DIMENSION) )
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_11_0 )
+ {
+ if ( (arraySize > D3D10_REQ_TEXTURE1D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D10_REQ_TEXTURE1D_U_DIMENSION) )
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_10_0 )
+ {
+ if ( (arraySize > 1) || (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURE1D_U_DIMENSION*/) )
+ return false;
+
+ if ( (fl < D3D_FEATURE_LEVEL_9_3) && (iWidth > 2048 /*D3D_FL9_1_REQ_TEXTURE1D_U_DIMENSION*/ ) )
+ return false;
+ }
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE2D:
+ if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE )
+ {
+ if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURECUBE) )
+ return false;
+
+ if ( (arraySize > D3D11_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D11_REQ_TEXTURECUBE_DIMENSION)
+ || (iHeight > D3D11_REQ_TEXTURECUBE_DIMENSION))
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_11_0 )
+ {
+ if ( (arraySize > D3D10_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D10_REQ_TEXTURECUBE_DIMENSION)
+ || (iHeight > D3D10_REQ_TEXTURECUBE_DIMENSION))
+ return false;
+
+ if ( (fl < D3D_FEATURE_LEVEL_10_1) && (arraySize != 6) )
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_10_0 )
+ {
+ if ( (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURECUBE_DIMENSION*/ )
+ || (iHeight > 4096 /*D3D_FL9_3_REQ_TEXTURECUBE_DIMENSION*/ ) )
+ return false;
+
+ if ( (fl < D3D_FEATURE_LEVEL_9_3)
+ && ( (iWidth > 512 /*D3D_FL9_1_REQ_TEXTURECUBE_DIMENSION*/)
+ || (iHeight > 512 /*D3D_FL9_1_REQ_TEXTURECUBE_DIMENSION*/) ) )
+ return false;
+ }
+ }
+ }
+ else // Not a cube map
+ {
+ if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE2D) )
+ return false;
+
+ if ( (arraySize > D3D11_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION)
+ || (iHeight > D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION))
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_11_0 )
+ {
+ if ( (arraySize > D3D10_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION)
+ || (iWidth > D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION)
+ || (iHeight > D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION))
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_10_0 )
+ {
+ if ( (arraySize > 1)
+ || (iWidth > 4096 /*D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION*/)
+ || (iHeight > 4096 /*D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) )
+ return false;
+
+ if ( (fl < D3D_FEATURE_LEVEL_9_3)
+ && ( (iWidth > 2048 /*D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION*/)
+ || (iHeight > 2048 /*D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION*/) ) )
+ return false;
+ }
+ }
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ if ( !(formatSupport & D3D11_FORMAT_SUPPORT_TEXTURE3D) )
+ return false;
+
+ if ( (arraySize > 1)
+ || (iWidth > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION)
+ || (iHeight > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION)
+ || (iDepth > D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) )
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_11_0 )
+ {
+ if ( (iWidth > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION)
+ || (iHeight > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION)
+ || (iDepth > D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION) )
+ return false;
+
+ if ( fl < D3D_FEATURE_LEVEL_10_0 )
+ {
+ if ( (iWidth > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/)
+ || (iHeight > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/)
+ || (iDepth > 256 /*D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION*/) )
+ return false;
+ }
+ }
+ break;
+
+ default:
+ // Not a supported dimension
+ return false;
+ }
+
+ return true;
+}
+
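+// Illustrative usage sketch (editorial note, not part of the library): metadata from
+// a loaded ScratchImage can be validated against the device's feature level before
+// any resource creation is attempted. 'device' and 'image' are assumed to exist.
+//
+//     if ( !IsSupportedTexture( device, image.GetMetadata() ) )
+//     {
+//         // fall back, e.g. convert or resize the image before calling CreateTexture
+//     }
+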
+
+//-------------------------------------------------------------------------------------
+// Create a texture resource
+//-------------------------------------------------------------------------------------
+HRESULT CreateTexture( ID3D11Device* pDevice, const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ ID3D11Resource** ppResource )
+{
+ if ( !pDevice || !srcImages || !nimages || !ppResource )
+ return E_INVALIDARG;
+
+ if ( !metadata.mipLevels || !metadata.arraySize )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( (metadata.width > 0xFFFFFFFF) || (metadata.height > 0xFFFFFFFF)
+ || (metadata.mipLevels > 0xFFFFFFFF) || (metadata.arraySize > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ std::unique_ptr<D3D11_SUBRESOURCE_DATA[]> initData( new D3D11_SUBRESOURCE_DATA[ metadata.mipLevels * metadata.arraySize ] );
+ if ( !initData )
+ return E_OUTOFMEMORY;
+
+ // Fill out subresource array
+ if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D )
+ {
+ //--- Volume case -------------------------------------------------------------
+ if ( !metadata.depth )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( metadata.depth > 0xFFFFFFFF )
+ return E_INVALIDARG;
+#endif
+
+ if ( metadata.arraySize > 1 )
+ // Direct3D 11 doesn't support arrays of 3D textures
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ size_t depth = metadata.depth;
+
+ size_t idx = 0;
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ size_t index = metadata.ComputeIndex( level, 0, 0 );
+ if ( index >= nimages )
+ return E_FAIL;
+
+ const Image& img = srcImages[ index ];
+
+ if ( img.format != metadata.format )
+ return E_FAIL;
+
+ if ( !img.pixels )
+ return E_POINTER;
+
+ // Verify pixels in image 1 .. (depth-1) are exactly image->slicePitch apart
+            // For 3D textures, this relies on all slices of the same miplevel being continuous in memory
+ // (this is how ScratchImage lays them out), which is why we just give the 0th slice to Direct3D 11
+ const uint8_t* pSlice = img.pixels + img.slicePitch;
+ for( size_t slice = 1; slice < depth; ++slice )
+ {
+ size_t tindex = metadata.ComputeIndex( level, 0, slice );
+ if ( tindex >= nimages )
+ return E_FAIL;
+
+ const Image& timg = srcImages[ tindex ];
+
+ if ( !timg.pixels )
+ return E_POINTER;
+
+ if ( timg.pixels != pSlice
+ || timg.format != metadata.format
+ || timg.rowPitch != img.rowPitch
+ || timg.slicePitch != img.slicePitch )
+ return E_FAIL;
+
+ pSlice = timg.pixels + img.slicePitch;
+ }
+
+ assert( idx < (metadata.mipLevels * metadata.arraySize) );
+
+ initData[idx].pSysMem = img.pixels;
+ initData[idx].SysMemPitch = static_cast<DWORD>( img.rowPitch );
+ initData[idx].SysMemSlicePitch = static_cast<DWORD>( img.slicePitch );
+ ++idx;
+
+ if ( depth > 1 )
+ depth >>= 1;
+ }
+ }
+ else
+ {
+ //--- 1D or 2D texture case ---------------------------------------------------
+ size_t idx = 0;
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ size_t index = metadata.ComputeIndex( level, item, 0 );
+ if ( index >= nimages )
+ return E_FAIL;
+
+ const Image& img = srcImages[ index ];
+
+ if ( img.format != metadata.format )
+ return E_FAIL;
+
+ if ( !img.pixels )
+ return E_POINTER;
+
+ assert( idx < (metadata.mipLevels * metadata.arraySize) );
+
+ initData[idx].pSysMem = img.pixels;
+ initData[idx].SysMemPitch = static_cast<DWORD>( img.rowPitch );
+ initData[idx].SysMemSlicePitch = static_cast<DWORD>( img.slicePitch );
+ ++idx;
+ }
+ }
+ }
+
+ // Create texture using static initialization data
+ HRESULT hr = E_FAIL;
+
+ switch ( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ {
+ D3D11_TEXTURE1D_DESC desc;
+ desc.Width = static_cast<UINT>( metadata.width );
+ desc.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ desc.ArraySize = static_cast<UINT>( metadata.arraySize );
+ desc.Format = metadata.format;
+ desc.Usage = D3D11_USAGE_DEFAULT;
+ desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+ desc.CPUAccessFlags = 0;
+ desc.MiscFlags = 0;
+
+ hr = pDevice->CreateTexture1D( &desc, initData.get(), reinterpret_cast<ID3D11Texture1D**>(ppResource) );
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE2D:
+ {
+ D3D11_TEXTURE2D_DESC desc;
+ desc.Width = static_cast<UINT>( metadata.width );
+ desc.Height = static_cast<UINT>( metadata.height );
+ desc.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ desc.ArraySize = static_cast<UINT>( metadata.arraySize );
+ desc.Format = metadata.format;
+ desc.SampleDesc.Count = 1;
+ desc.SampleDesc.Quality = 0;
+ desc.Usage = D3D11_USAGE_DEFAULT;
+ desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+ desc.CPUAccessFlags = 0;
+ desc.MiscFlags = (metadata.miscFlags & TEX_MISC_TEXTURECUBE) ? D3D11_RESOURCE_MISC_TEXTURECUBE : 0;
+
+ hr = pDevice->CreateTexture2D( &desc, initData.get(), reinterpret_cast<ID3D11Texture2D**>(ppResource) );
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ {
+ D3D11_TEXTURE3D_DESC desc;
+ desc.Width = static_cast<UINT>( metadata.width );
+ desc.Height = static_cast<UINT>( metadata.height );
+ desc.Depth = static_cast<UINT>( metadata.depth );
+ desc.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ desc.Format = metadata.format;
+ desc.Usage = D3D11_USAGE_DEFAULT;
+ desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+ desc.CPUAccessFlags = 0;
+ desc.MiscFlags = 0;
+
+ hr = pDevice->CreateTexture3D( &desc, initData.get(), reinterpret_cast<ID3D11Texture3D**>(ppResource) );
+ }
+ break;
+ }
+
+ return hr;
+}
+
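+// Illustrative usage sketch (editorial note, not part of the library): all subresources
+// of a ScratchImage are passed at once, so the whole mip chain / array is uploaded as
+// static initialization data. 'device' and 'image' are assumed to exist.
+//
+//     ScopedObject<ID3D11Resource> tex;
+//     HRESULT hr = CreateTexture( device, image.GetImages(), image.GetImageCount(),
+//                                 image.GetMetadata(), &tex );
+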
+
+//-------------------------------------------------------------------------------------
+// Create a shader resource view and associated texture
+//-------------------------------------------------------------------------------------
+HRESULT CreateShaderResourceView( ID3D11Device* pDevice, const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ ID3D11ShaderResourceView** ppSRV )
+{
+ if ( !ppSRV )
+ return E_INVALIDARG;
+
+ ScopedObject<ID3D11Resource> resource;
+ HRESULT hr = CreateTexture( pDevice, srcImages, nimages, metadata, &resource );
+ if ( FAILED(hr) )
+ return hr;
+
+ assert( !resource.IsNull() );
+
+ D3D11_SHADER_RESOURCE_VIEW_DESC SRVDesc;
+ memset( &SRVDesc, 0, sizeof(SRVDesc) );
+ SRVDesc.Format = metadata.format;
+
+ switch ( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ if ( metadata.arraySize > 1 )
+ {
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY;
+ SRVDesc.Texture1DArray.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ SRVDesc.Texture1DArray.ArraySize = static_cast<UINT>( metadata.arraySize );
+ }
+ else
+ {
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D;
+ SRVDesc.Texture1D.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE2D:
+ if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE )
+ {
+ if (metadata.arraySize > 6)
+ {
+ assert( (metadata.arraySize % 6) == 0 );
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBEARRAY;
+ SRVDesc.TextureCubeArray.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ SRVDesc.TextureCubeArray.NumCubes = static_cast<UINT>( metadata.arraySize / 6 );
+ }
+ else
+ {
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE;
+ SRVDesc.TextureCube.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ }
+ }
+ else if ( metadata.arraySize > 1 )
+ {
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY;
+ SRVDesc.Texture2DArray.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ SRVDesc.Texture2DArray.ArraySize = static_cast<UINT>( metadata.arraySize );
+ }
+ else
+ {
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
+ SRVDesc.Texture2D.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ assert( metadata.arraySize == 1 );
+ SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D;
+ SRVDesc.Texture3D.MipLevels = static_cast<UINT>( metadata.mipLevels );
+ break;
+
+ default:
+ return E_FAIL;
+ }
+
+ hr = pDevice->CreateShaderResourceView( resource.Get(), &SRVDesc, ppSRV );
+ if ( FAILED(hr) )
+ return hr;
+
+ assert( *ppSRV );
+
+ return S_OK;
+}
+
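+// Illustrative usage sketch (editorial note, not part of the library): loading a DDS
+// file and turning it into an SRV in one step. LoadFromDDSFile and DDS_FLAGS_NONE are
+// assumed to be declared elsewhere in DirectXTex.h.
+//
+//     TexMetadata mdata;
+//     ScratchImage image;
+//     HRESULT hr = LoadFromDDSFile( L"texture.dds", DDS_FLAGS_NONE, &mdata, image );
+//     if ( SUCCEEDED(hr) )
+//     {
+//         ID3D11ShaderResourceView* srv = nullptr;
+//         hr = CreateShaderResourceView( device, image.GetImages(), image.GetImageCount(),
+//                                        mdata, &srv );
+//     }
+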
+
+//-------------------------------------------------------------------------------------
+// Capture a texture resource into a ScratchImage (which can then be saved as a DDS file in memory/on disk)
+//-------------------------------------------------------------------------------------
+HRESULT CaptureTexture( ID3D11Device* pDevice, ID3D11DeviceContext* pContext, ID3D11Resource* pSource, ScratchImage& result )
+{
+ if ( !pDevice || !pContext || !pSource )
+ return E_INVALIDARG;
+
+ D3D11_RESOURCE_DIMENSION resType = D3D11_RESOURCE_DIMENSION_UNKNOWN;
+ pSource->GetType( &resType );
+
+ HRESULT hr;
+
+ switch( resType )
+ {
+ case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
+ {
+ ScopedObject<ID3D11Texture1D> pTexture;
+ hr = pSource->QueryInterface( __uuidof(ID3D11Texture1D), (void**) &pTexture );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pTexture.Get() );
+
+ D3D11_TEXTURE1D_DESC desc;
+ pTexture->GetDesc( &desc );
+
+ desc.BindFlags = 0;
+ desc.MiscFlags = 0;
+ desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc.Usage = D3D11_USAGE_STAGING;
+
+ ScopedObject<ID3D11Texture1D> pStaging;
+ hr = pDevice->CreateTexture1D( &desc, 0, &pStaging );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pStaging.Get() );
+
+ pContext->CopyResource( pStaging.Get(), pSource );
+
+ TexMetadata mdata;
+ mdata.width = desc.Width;
+ mdata.height = mdata.depth = 1;
+ mdata.arraySize = desc.ArraySize;
+ mdata.mipLevels = desc.MipLevels;
+ mdata.miscFlags = 0;
+ mdata.format = desc.Format;
+ mdata.dimension = TEX_DIMENSION_TEXTURE1D;
+
+ hr = result.Initialize( mdata );
+ if ( FAILED(hr) )
+ break;
+
+ hr = _Capture( pContext, pStaging.Get(), mdata, result );
+ }
+ break;
+
+ case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
+ {
+ ScopedObject<ID3D11Texture2D> pTexture;
+ hr = pSource->QueryInterface( __uuidof(ID3D11Texture2D), (void**) &pTexture );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pTexture.Get() );
+
+ D3D11_TEXTURE2D_DESC desc;
+ pTexture->GetDesc( &desc );
+
+ ScopedObject<ID3D11Texture2D> pStaging;
+ if ( desc.SampleDesc.Count > 1 )
+ {
+ desc.SampleDesc.Count = 1;
+ desc.SampleDesc.Quality = 0;
+
+ ScopedObject<ID3D11Texture2D> pTemp;
+ hr = pDevice->CreateTexture2D( &desc, 0, &pTemp );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pTemp.Get() );
+
+ DXGI_FORMAT fmt = desc.Format;
+ if ( IsTypeless(fmt) )
+ {
+ // Assume a UNORM if it exists otherwise use FLOAT
+ fmt = MakeTypelessUNORM( fmt );
+ fmt = MakeTypelessFLOAT( fmt );
+ }
+
+ UINT support = 0;
+ hr = pDevice->CheckFormatSupport( fmt, &support );
+ if ( FAILED(hr) )
+ break;
+
+ if ( !(support & D3D11_FORMAT_SUPPORT_MULTISAMPLE_RESOLVE) )
+ {
+ hr = E_FAIL;
+ break;
+ }
+
+ for( UINT item = 0; item < desc.ArraySize; ++item )
+ {
+ for( UINT level = 0; level < desc.MipLevels; ++level )
+ {
+ UINT index = D3D11CalcSubresource( level, item, desc.MipLevels );
+
+ pContext->ResolveSubresource( pTemp.Get(), index, pSource, index, fmt );
+ }
+ }
+
+ desc.BindFlags = 0;
+ desc.MiscFlags &= D3D11_RESOURCE_MISC_TEXTURECUBE;
+ desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc.Usage = D3D11_USAGE_STAGING;
+
+ hr = pDevice->CreateTexture2D( &desc, 0, &pStaging );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pStaging.Get() );
+
+ pContext->CopyResource( pStaging.Get(), pTemp.Get() );
+ }
+ else
+ {
+ desc.BindFlags = 0;
+ desc.MiscFlags &= D3D11_RESOURCE_MISC_TEXTURECUBE;
+ desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc.Usage = D3D11_USAGE_STAGING;
+
+ hr = pDevice->CreateTexture2D( &desc, 0, &pStaging );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pStaging.Get() );
+
+ pContext->CopyResource( pStaging.Get(), pSource );
+ }
+
+ TexMetadata mdata;
+ mdata.width = desc.Width;
+ mdata.height = desc.Height;
+ mdata.depth = 1;
+ mdata.arraySize = desc.ArraySize;
+ mdata.mipLevels = desc.MipLevels;
+ mdata.miscFlags = (desc.MiscFlags & D3D11_RESOURCE_MISC_TEXTURECUBE) ? TEX_MISC_TEXTURECUBE : 0;
+ mdata.format = desc.Format;
+ mdata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ hr = result.Initialize( mdata );
+ if ( FAILED(hr) )
+ break;
+
+ hr = _Capture( pContext, pStaging.Get(), mdata, result );
+ }
+ break;
+
+ case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
+ {
+ ScopedObject<ID3D11Texture3D> pTexture;
+ hr = pSource->QueryInterface( __uuidof(ID3D11Texture3D), (void**) &pTexture );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pTexture.Get() );
+
+ D3D11_TEXTURE3D_DESC desc;
+ pTexture->GetDesc( &desc );
+
+ desc.BindFlags = 0;
+ desc.MiscFlags = 0;
+ desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ desc.Usage = D3D11_USAGE_STAGING;
+
+ ScopedObject<ID3D11Texture3D> pStaging;
+ hr = pDevice->CreateTexture3D( &desc, 0, &pStaging );
+ if ( FAILED(hr) )
+ break;
+
+ assert( pStaging.Get() );
+
+ pContext->CopyResource( pStaging.Get(), pSource );
+
+ TexMetadata mdata;
+ mdata.width = desc.Width;
+ mdata.height = desc.Height;
+ mdata.depth = desc.Depth;
+ mdata.arraySize = 1;
+ mdata.mipLevels = desc.MipLevels;
+ mdata.miscFlags = 0;
+ mdata.format = desc.Format;
+ mdata.dimension = TEX_DIMENSION_TEXTURE3D;
+
+ hr = result.Initialize( mdata );
+ if ( FAILED(hr) )
+ break;
+
+ hr = _Capture( pContext, pStaging.Get(), mdata, result );
+ }
+ break;
+
+ default:
+ hr = E_FAIL;
+ break;
+ }
+
+ if ( FAILED(hr) )
+ {
+ result.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
+
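+// Illustrative usage sketch (editorial note, not part of the library): capturing a GPU
+// texture back to system memory and writing it out. SaveToDDSFile is assumed to be the
+// writer declared elsewhere in DirectXTex.h.
+//
+//     ScratchImage captured;
+//     HRESULT hr = CaptureTexture( device, context, texture, captured );
+//     if ( SUCCEEDED(hr) )
+//     {
+//         hr = SaveToDDSFile( captured.GetImages(), captured.GetImageCount(),
+//                             captured.GetMetadata(), DDS_FLAGS_NONE, L"capture.dds" );
+//     }
+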
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp
new file mode 100644
index 00000000..d249b4fe
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexDDS.cpp
@@ -0,0 +1,1684 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexDDS.cpp
+//
+// DirectX Texture Library - Microsoft DirectDraw Surface (DDS) file format reader/writer
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+#include "DDS.h"
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Legacy format mapping table (used for DDS files without 'DX10' extended header)
+//-------------------------------------------------------------------------------------
+enum CONVERSION_FLAGS
+{
+ CONV_FLAGS_NONE = 0x0,
+ CONV_FLAGS_EXPAND = 0x1, // Conversion requires expanded pixel size
+ CONV_FLAGS_NOALPHA = 0x2, // Conversion requires setting alpha to known value
+ CONV_FLAGS_SWIZZLE = 0x4, // BGR/RGB order swizzling required
+ CONV_FLAGS_PAL8 = 0x8, // Has an 8-bit palette
+ CONV_FLAGS_888 = 0x10, // Source is an 8:8:8 (24bpp) format
+ CONV_FLAGS_565 = 0x20, // Source is a 5:6:5 (16bpp) format
+ CONV_FLAGS_5551 = 0x40, // Source is a 5:5:5:1 (16bpp) format
+ CONV_FLAGS_4444 = 0x80, // Source is a 4:4:4:4 (16bpp) format
+ CONV_FLAGS_44 = 0x100, // Source is a 4:4 (8bpp) format
+ CONV_FLAGS_332 = 0x200, // Source is a 3:3:2 (8bpp) format
+ CONV_FLAGS_8332 = 0x400, // Source is a 8:3:3:2 (16bpp) format
+ CONV_FLAGS_A8P8 = 0x800, // Has an 8-bit palette with an alpha channel
+ CONV_FLAGS_DX10 = 0x10000, // Has the 'DX10' extension header
+};
+
+struct LegacyDDS
+{
+ DXGI_FORMAT format;
+ DWORD convFlags;
+ DDS_PIXELFORMAT ddpf;
+};
+
+const LegacyDDS g_LegacyDDSMap[] =
+{
+ { DXGI_FORMAT_BC1_UNORM, CONV_FLAGS_NONE, DDSPF_DXT1 }, // D3DFMT_DXT1
+ { DXGI_FORMAT_BC2_UNORM, CONV_FLAGS_NONE, DDSPF_DXT3 }, // D3DFMT_DXT3
+ { DXGI_FORMAT_BC3_UNORM, CONV_FLAGS_NONE, DDSPF_DXT5 }, // D3DFMT_DXT5
+
+ { DXGI_FORMAT_BC2_UNORM, CONV_FLAGS_NONE, DDSPF_DXT2 }, // D3DFMT_DXT2 (ignore premultiply)
+ { DXGI_FORMAT_BC3_UNORM, CONV_FLAGS_NONE, DDSPF_DXT4 }, // D3DFMT_DXT4 (ignore premultiply)
+
+ { DXGI_FORMAT_BC4_UNORM, CONV_FLAGS_NONE, DDSPF_BC4_UNORM },
+ { DXGI_FORMAT_BC4_SNORM, CONV_FLAGS_NONE, DDSPF_BC4_SNORM },
+ { DXGI_FORMAT_BC5_UNORM, CONV_FLAGS_NONE, DDSPF_BC5_UNORM },
+ { DXGI_FORMAT_BC5_SNORM, CONV_FLAGS_NONE, DDSPF_BC5_SNORM },
+
+ { DXGI_FORMAT_BC4_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC( 'A', 'T', 'I', '1' ), 0, 0, 0, 0, 0 } },
+ { DXGI_FORMAT_BC5_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, MAKEFOURCC( 'A', 'T', 'I', '2' ), 0, 0, 0, 0, 0 } },
+
+ { DXGI_FORMAT_R8G8_B8G8_UNORM, CONV_FLAGS_NONE, DDSPF_R8G8_B8G8 }, // D3DFMT_R8G8_B8G8
+ { DXGI_FORMAT_G8R8_G8B8_UNORM, CONV_FLAGS_NONE, DDSPF_G8R8_G8B8 }, // D3DFMT_G8R8_G8B8
+
+ { DXGI_FORMAT_B8G8R8A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8R8G8B8 }, // D3DFMT_A8R8G8B8 (uses DXGI 1.1 format)
+ { DXGI_FORMAT_B8G8R8X8_UNORM, CONV_FLAGS_NONE, DDSPF_X8R8G8B8 }, // D3DFMT_X8R8G8B8 (uses DXGI 1.1 format)
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8B8G8R8 }, // D3DFMT_A8B8G8R8
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_NOALPHA, DDSPF_X8B8G8R8 }, // D3DFMT_X8B8G8R8
+ { DXGI_FORMAT_R16G16_UNORM, CONV_FLAGS_NONE, DDSPF_G16R16 }, // D3DFMT_G16R16
+
+ { DXGI_FORMAT_R10G10B10A2_UNORM, CONV_FLAGS_SWIZZLE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 } }, // D3DFMT_A2R10G10B10 (D3DX reversal issue workaround)
+ { DXGI_FORMAT_R10G10B10A2_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 } }, // D3DFMT_A2B10G10R10 (D3DX reversal issue workaround)
+
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_NOALPHA
+ | CONV_FLAGS_888, DDSPF_R8G8B8 }, // D3DFMT_R8G8B8
+
+ { DXGI_FORMAT_B5G6R5_UNORM, CONV_FLAGS_565, DDSPF_R5G6B5 }, // D3DFMT_R5G6B5
+ { DXGI_FORMAT_B5G5R5A1_UNORM, CONV_FLAGS_5551, DDSPF_A1R5G5B5 }, // D3DFMT_A1R5G5B5
+ { DXGI_FORMAT_B5G5R5A1_UNORM, CONV_FLAGS_5551
+ | CONV_FLAGS_NOALPHA, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x7c00, 0x03e0, 0x001f, 0x0000 } }, // D3DFMT_X1R5G5B5
+
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_8332, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x00e0, 0x001c, 0x0003, 0xff00 } }, // D3DFMT_A8R3G3B2
+ { DXGI_FORMAT_B5G6R5_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_332, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 8, 0xe0, 0x1c, 0x03, 0x00 } }, // D3DFMT_R3G3B2
+
+ { DXGI_FORMAT_R8_UNORM, CONV_FLAGS_NONE, DDSPF_L8 }, // D3DFMT_L8
+ { DXGI_FORMAT_R16_UNORM, CONV_FLAGS_NONE, DDSPF_L16 }, // D3DFMT_L16
+ { DXGI_FORMAT_R8G8_UNORM, CONV_FLAGS_NONE, DDSPF_A8L8 }, // D3DFMT_A8L8
+
+ { DXGI_FORMAT_A8_UNORM, CONV_FLAGS_NONE, DDSPF_A8 }, // D3DFMT_A8
+
+ { DXGI_FORMAT_R16G16B16A16_UNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 36, 0, 0, 0, 0, 0 } }, // D3DFMT_A16B16G16R16
+ { DXGI_FORMAT_R16G16B16A16_SNORM, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 110, 0, 0, 0, 0, 0 } }, // D3DFMT_Q16W16V16U16
+ { DXGI_FORMAT_R16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 111, 0, 0, 0, 0, 0 } }, // D3DFMT_R16F
+ { DXGI_FORMAT_R16G16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 112, 0, 0, 0, 0, 0 } }, // D3DFMT_G16R16F
+ { DXGI_FORMAT_R16G16B16A16_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 113, 0, 0, 0, 0, 0 } }, // D3DFMT_A16B16G16R16F
+ { DXGI_FORMAT_R32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 114, 0, 0, 0, 0, 0 } }, // D3DFMT_R32F
+ { DXGI_FORMAT_R32G32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 115, 0, 0, 0, 0, 0 } }, // D3DFMT_G32R32F
+ { DXGI_FORMAT_R32G32B32A32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_FOURCC, 116, 0, 0, 0, 0, 0 } }, // D3DFMT_A32B32G32R32F
+
+ { DXGI_FORMAT_R32_FLOAT, CONV_FLAGS_NONE, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 32, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 } }, // D3DFMT_R32F (D3DX uses FourCC 114 instead)
+
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_PAL8
+ | CONV_FLAGS_A8P8, { sizeof(DDS_PIXELFORMAT), DDS_PAL8, 0, 16, 0, 0, 0, 0 } }, // D3DFMT_A8P8
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_PAL8, { sizeof(DDS_PIXELFORMAT), DDS_PAL8, 0, 8, 0, 0, 0, 0 } }, // D3DFMT_P8
+
+#ifdef DXGI_1_2_FORMATS
+ { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_4444, DDSPF_A4R4G4B4 }, // D3DFMT_A4R4G4B4 (uses DXGI 1.2 format)
+ { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_NOALPHA
+ | CONV_FLAGS_4444, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0f00, 0x00f0, 0x000f, 0x0000 } }, // D3DFMT_X4R4G4B4 (uses DXGI 1.2 format)
+ { DXGI_FORMAT_B4G4R4A4_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_44, { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0x0f, 0x00, 0x00, 0xf0 } }, // D3DFMT_A4L4 (uses DXGI 1.2 format)
+#else // !DXGI_1_2_FORMATS
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_4444, DDSPF_A4R4G4B4 }, // D3DFMT_A4R4G4B4
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_NOALPHA
+ | CONV_FLAGS_4444, { sizeof(DDS_PIXELFORMAT), DDS_RGB, 0, 16, 0x0f00, 0x00f0, 0x000f, 0x0000 } }, // D3DFMT_X4R4G4B4
+ { DXGI_FORMAT_R8G8B8A8_UNORM, CONV_FLAGS_EXPAND
+ | CONV_FLAGS_44, { sizeof(DDS_PIXELFORMAT), DDS_LUMINANCE, 0, 8, 0x0f, 0x00, 0x00, 0xf0 } }, // D3DFMT_A4L4
+#endif
+};
+
+// Note that many common DDS reader/writers (including D3DX) swap the
+// RED/BLUE masks for 10:10:10:2 formats. We assume below that the
+// 'backwards' header mask is being used since it is most likely written
+// by D3DX. The more robust solution is to use the 'DX10' header extension
+// and specify the DXGI_FORMAT_R10G10B10A2_UNORM format directly
+
+// We do not support the following legacy Direct3D 9 formats:
+// BumpDuDv D3DFMT_V8U8, D3DFMT_Q8W8V8U8, D3DFMT_V16U16, D3DFMT_A2W10V10U10
+// BumpLuminance D3DFMT_L6V5U5, D3DFMT_X8L8V8U8
+// FourCC "UYVY" D3DFMT_UYVY
+// FourCC "YUY2" D3DFMT_YUY2
+// FourCC 117 D3DFMT_CxV8U8
+// ZBuffer D3DFMT_D16_LOCKABLE
+// FourCC 82 D3DFMT_D32F_LOCKABLE
+
+static DXGI_FORMAT _GetDXGIFormat( const DDS_PIXELFORMAT& ddpf, DWORD flags, _Inout_opt_ DWORD* convFlags )
+{
+ const size_t MAP_SIZE = sizeof(g_LegacyDDSMap) / sizeof(LegacyDDS);
+ size_t index = 0;
+ for( index = 0; index < MAP_SIZE; ++index )
+ {
+ const LegacyDDS* entry = &g_LegacyDDSMap[index];
+
+ if ( ddpf.dwFlags & entry->ddpf.dwFlags )
+ {
+ if ( entry->ddpf.dwFlags & DDS_FOURCC )
+ {
+ if ( ddpf.dwFourCC == entry->ddpf.dwFourCC )
+ break;
+ }
+ else if ( entry->ddpf.dwFlags & DDS_PAL8 )
+ {
+ if ( ddpf.dwRGBBitCount == entry->ddpf.dwRGBBitCount )
+ break;
+ }
+ else if ( ddpf.dwRGBBitCount == entry->ddpf.dwRGBBitCount )
+ {
+ // RGB, RGBA, ALPHA, LUMINANCE
+ if ( ddpf.dwRBitMask == entry->ddpf.dwRBitMask
+ && ddpf.dwGBitMask == entry->ddpf.dwGBitMask
+ && ddpf.dwBBitMask == entry->ddpf.dwBBitMask
+ && ddpf.dwABitMask == entry->ddpf.dwABitMask )
+ break;
+ }
+ }
+ }
+
+ if ( index >= MAP_SIZE )
+ return DXGI_FORMAT_UNKNOWN;
+
+ DWORD cflags = g_LegacyDDSMap[index].convFlags;
+ DXGI_FORMAT format = g_LegacyDDSMap[index].format;
+
+ if ( (cflags & CONV_FLAGS_EXPAND) && (flags & DDS_FLAGS_NO_LEGACY_EXPANSION) )
+ return DXGI_FORMAT_UNKNOWN;
+
+ if ( (format == DXGI_FORMAT_R10G10B10A2_UNORM) && (flags & DDS_FLAGS_NO_R10B10G10A2_FIXUP) )
+ {
+ cflags ^= CONV_FLAGS_SWIZZLE;
+ }
+
+ if ( convFlags )
+ *convFlags = cflags;
+
+ return format;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Decodes DDS header including optional DX10 extended header
+//-------------------------------------------------------------------------------------
+static HRESULT _DecodeDDSHeader( _In_bytecount_(size) LPCVOID pSource, size_t size, DWORD flags, _Out_ TexMetadata& metadata,
+ _Inout_opt_ DWORD* convFlags )
+{
+ if ( !pSource )
+ return E_INVALIDARG;
+
+ memset( &metadata, 0, sizeof(TexMetadata) );
+
+ if ( size < (sizeof(DDS_HEADER) + sizeof(uint32_t)) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ // DDS files always start with the same magic number ("DDS ")
+ uint32_t dwMagicNumber = *reinterpret_cast<const uint32_t*>(pSource);
+ if ( dwMagicNumber != DDS_MAGIC )
+ {
+ return E_FAIL;
+ }
+
+ const DDS_HEADER* pHeader = reinterpret_cast<const DDS_HEADER*>( (const uint8_t*)pSource + sizeof( uint32_t ) );
+ assert( pHeader );
+
+ // Verify header to validate DDS file
+ if ( pHeader->dwSize != sizeof(DDS_HEADER)
+ || pHeader->ddspf.dwSize != sizeof(DDS_PIXELFORMAT) )
+ {
+ return E_FAIL;
+ }
+
+ metadata.mipLevels = pHeader->dwMipMapCount;
+ if ( metadata.mipLevels == 0 )
+ metadata.mipLevels = 1;
+
+ // Check for DX10 extension
+ if ( (pHeader->ddspf.dwFlags & DDS_FOURCC)
+ && (MAKEFOURCC( 'D', 'X', '1', '0' ) == pHeader->ddspf.dwFourCC) )
+ {
+ // Buffer must be big enough for both headers and magic value
+ if ( size < (sizeof(DDS_HEADER)+sizeof(uint32_t)+sizeof(DDS_HEADER_DXT10)) )
+ {
+ return E_FAIL;
+ }
+
+ const DDS_HEADER_DXT10* d3d10ext = reinterpret_cast<const DDS_HEADER_DXT10*>( (const uint8_t*)pSource + sizeof( uint32_t ) + sizeof(DDS_HEADER) );
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_DX10;
+
+ metadata.arraySize = d3d10ext->arraySize;
+ if ( metadata.arraySize == 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ metadata.format = d3d10ext->dxgiFormat;
+ if ( !IsValid( metadata.format ) )
+ {
+            return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ switch ( d3d10ext->resourceDimension )
+ {
+ case DDS_DIMENSION_TEXTURE1D:
+
+ // D3DX writes 1D textures with a fixed Height of 1
+ if ( (pHeader->dwFlags & DDS_HEIGHT) && pHeader->dwHeight != 1 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ metadata.width = pHeader->dwWidth;
+ metadata.height = 1;
+ metadata.depth = 1;
+ metadata.dimension = TEX_DIMENSION_TEXTURE1D;
+ break;
+
+ case DDS_DIMENSION_TEXTURE2D:
+ if ( d3d10ext->miscFlag & DDS_RESOURCE_MISC_TEXTURECUBE )
+ {
+ metadata.miscFlags |= TEX_MISC_TEXTURECUBE;
+ metadata.arraySize *= 6;
+ }
+
+ metadata.width = pHeader->dwWidth;
+ metadata.height = pHeader->dwHeight;
+ metadata.depth = 1;
+ metadata.dimension = TEX_DIMENSION_TEXTURE2D;
+ break;
+
+ case DDS_DIMENSION_TEXTURE3D:
+ if ( !(pHeader->dwFlags & DDS_HEADER_FLAGS_VOLUME) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ if ( metadata.arraySize > 1 )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ metadata.width = pHeader->dwWidth;
+ metadata.height = pHeader->dwHeight;
+ metadata.depth = pHeader->dwDepth;
+ metadata.dimension = TEX_DIMENSION_TEXTURE3D;
+ break;
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+ }
+ else
+ {
+ metadata.arraySize = 1;
+
+ if ( pHeader->dwFlags & DDS_HEADER_FLAGS_VOLUME )
+ {
+ metadata.width = pHeader->dwWidth;
+ metadata.height = pHeader->dwHeight;
+ metadata.depth = pHeader->dwDepth;
+ metadata.dimension = TEX_DIMENSION_TEXTURE3D;
+ }
+ else
+ {
+ if ( pHeader->dwCaps2 & DDS_CUBEMAP )
+ {
+ // We require all six faces to be defined
+ if ( (pHeader->dwCaps2 & DDS_CUBEMAP_ALLFACES ) != DDS_CUBEMAP_ALLFACES )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ metadata.arraySize = 6;
+ metadata.miscFlags |= TEX_MISC_TEXTURECUBE;
+ }
+
+ metadata.width = pHeader->dwWidth;
+ metadata.height = pHeader->dwHeight;
+ metadata.depth = 1;
+ metadata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ // Note there's no way for a legacy Direct3D 9 DDS to express a '1D' texture
+ }
+
+ metadata.format = _GetDXGIFormat( pHeader->ddspf, flags, convFlags );
+
+ if ( metadata.format == DXGI_FORMAT_UNKNOWN )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Special flag for handling BGR DXGI 1.1 formats
+ if (flags & DDS_FLAGS_FORCE_RGB)
+ {
+ switch ( metadata.format )
+ {
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE;
+ break;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA;
+ break;
+
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_TYPELESS;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE;
+ break;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE;
+ break;
+
+ case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_TYPELESS;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA;
+ break;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA;
+ break;
+ }
+ }
+
+ // Special flag for handling 16bpp formats
+ if (flags & DDS_FLAGS_NO_16BPP)
+ {
+ switch ( metadata.format )
+ {
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+#endif
+            if ( convFlags )
+            {
+                *convFlags |= CONV_FLAGS_EXPAND;
+                // Check the original format before it is overwritten below
+                if ( metadata.format == DXGI_FORMAT_B5G6R5_UNORM )
+                    *convFlags |= CONV_FLAGS_NOALPHA;
+            }
+            metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Encodes DDS file header (magic value, header, optional DX10 extended header)
+//-------------------------------------------------------------------------------------
+HRESULT _EncodeDDSHeader( _In_ const TexMetadata& metadata, DWORD flags,
+ _Out_opt_cap_x_(maxsize) LPVOID pDestination, _In_ size_t maxsize, _Out_ size_t& required )
+{
+ assert( IsValid( metadata.format ) && !IsVideo( metadata.format ) );
+
+ if ( metadata.arraySize > 1 )
+ {
+ if ( (metadata.arraySize != 6) || (metadata.dimension != TEX_DIMENSION_TEXTURE2D) || !(metadata.miscFlags & TEX_MISC_TEXTURECUBE) )
+ {
+ flags |= DDS_FLAGS_FORCE_DX10_EXT;
+ }
+ }
+
+ DDS_PIXELFORMAT ddpf = { 0 };
+ if ( !(flags & DDS_FLAGS_FORCE_DX10_EXT) )
+ {
+ switch( metadata.format )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8B8G8R8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_R16G16_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_G16R16, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_R8G8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8L8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_R16_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_L16, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_R8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_L8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_R8G8_B8G8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_R8G8_B8G8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_G8R8_G8B8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_G8R8_G8B8, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC1_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT1, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC2_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT3, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC3_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_DXT5, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC4_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC4_UNORM, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC4_SNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC4_SNORM, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC5_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC5_UNORM, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_BC5_SNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_BC5_SNORM, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_B5G6R5_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_R5G6B5, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_B5G5R5A1_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A1R5G5B5, sizeof(DDS_PIXELFORMAT) ); break;
+ case DXGI_FORMAT_B8G8R8A8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A8R8G8B8, sizeof(DDS_PIXELFORMAT) ); break; // DXGI 1.1
+ case DXGI_FORMAT_B8G8R8X8_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_X8R8G8B8, sizeof(DDS_PIXELFORMAT) ); break; // DXGI 1.1
+
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM: memcpy_s( &ddpf, sizeof(ddpf), &DDSPF_A4R4G4B4, sizeof(DDS_PIXELFORMAT) ); break;
+#endif
+
+ // Legacy D3DX formats using D3DFMT enum value as FourCC
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 116; // D3DFMT_A32B32G32R32F
+ break;
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 113; // D3DFMT_A16B16G16R16F
+ break;
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 36; // D3DFMT_A16B16G16R16
+ break;
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 110; // D3DFMT_Q16W16V16U16
+ break;
+ case DXGI_FORMAT_R32G32_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 115; // D3DFMT_G32R32F
+ break;
+ case DXGI_FORMAT_R16G16_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 112; // D3DFMT_G16R16F
+ break;
+ case DXGI_FORMAT_R32_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 114; // D3DFMT_R32F
+ break;
+ case DXGI_FORMAT_R16_FLOAT:
+ ddpf.dwSize = sizeof(DDS_PIXELFORMAT); ddpf.dwFlags = DDS_FOURCC; ddpf.dwFourCC = 111; // D3DFMT_R16F
+ break;
+ }
+ }
+
+ required = sizeof(uint32_t) + sizeof(DDS_HEADER);
+
+ if ( ddpf.dwSize == 0 )
+ required += sizeof(DDS_HEADER_DXT10);
+
+ if ( !pDestination )
+ return S_OK;
+
+ if ( maxsize < required )
+ return E_NOT_SUFFICIENT_BUFFER;
+
+ *reinterpret_cast<uint32_t*>(pDestination) = DDS_MAGIC;
+
+ DDS_HEADER* header = reinterpret_cast<DDS_HEADER*>( reinterpret_cast<uint8_t*>(pDestination) + sizeof(uint32_t) );
+ assert( header );
+
+ memset( header, 0, sizeof(DDS_HEADER ) );
+ header->dwSize = sizeof( DDS_HEADER );
+ header->dwFlags = DDS_HEADER_FLAGS_TEXTURE;
+ header->dwCaps = DDS_SURFACE_FLAGS_TEXTURE;
+
+ if (metadata.mipLevels > 0)
+ {
+ header->dwFlags |= DDS_HEADER_FLAGS_MIPMAP;
+
+#ifdef _AMD64_
+ if ( metadata.mipLevels > 0xFFFFFFFF )
+ return E_INVALIDARG;
+#endif
+
+ header->dwMipMapCount = static_cast<uint32_t>( metadata.mipLevels );
+
+ if ( header->dwMipMapCount > 1 )
+ header->dwCaps |= DDS_SURFACE_FLAGS_MIPMAP;
+ }
+
+ switch( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+#ifdef _AMD64_
+        if ( metadata.width > 0xFFFFFFFF )
+ return E_INVALIDARG;
+#endif
+
+ header->dwWidth = static_cast<uint32_t>( metadata.width );
+ header->dwHeight = header->dwDepth = 1;
+ break;
+
+ case TEX_DIMENSION_TEXTURE2D:
+#ifdef _AMD64_
+ if ( metadata.height > 0xFFFFFFFF
+ || metadata.width > 0xFFFFFFFF)
+ return E_INVALIDARG;
+#endif
+
+ header->dwHeight = static_cast<uint32_t>( metadata.height );
+ header->dwWidth = static_cast<uint32_t>( metadata.width );
+ header->dwDepth = 1;
+
+ if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE )
+ {
+ header->dwCaps |= DDS_SURFACE_FLAGS_CUBEMAP;
+ header->dwCaps2 |= DDS_CUBEMAP_ALLFACES;
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+#ifdef _AMD64_
+ if ( metadata.height > 0xFFFFFFFF
+ || metadata.width > 0xFFFFFFFF
+ || metadata.depth > 0xFFFFFFFF )
+ return E_INVALIDARG;
+#endif
+
+ header->dwFlags |= DDS_HEADER_FLAGS_VOLUME;
+ header->dwCaps2 |= DDS_FLAGS_VOLUME;
+ header->dwHeight = static_cast<uint32_t>( metadata.height );
+ header->dwWidth = static_cast<uint32_t>( metadata.width );
+ header->dwDepth = static_cast<uint32_t>( metadata.depth );
+ break;
+
+ default:
+ return E_FAIL;
+ }
+
+ size_t rowPitch, slicePitch;
+ ComputePitch( metadata.format, metadata.width, metadata.height, rowPitch, slicePitch, CP_FLAGS_NONE );
+
+#ifdef _AMD64_
+ if ( slicePitch > 0xFFFFFFFF
+ || rowPitch > 0xFFFFFFFF )
+ return E_FAIL;
+#endif
+
+ if ( IsCompressed( metadata.format ) )
+ {
+ header->dwFlags |= DDS_HEADER_FLAGS_LINEARSIZE;
+ header->dwPitchOrLinearSize = static_cast<uint32_t>( slicePitch );
+ }
+ else
+ {
+ header->dwFlags |= DDS_HEADER_FLAGS_PITCH;
+ header->dwPitchOrLinearSize = static_cast<uint32_t>( rowPitch );
+ }
+
+ if ( ddpf.dwSize == 0 )
+ {
+ memcpy_s( &header->ddspf, sizeof(header->ddspf), &DDSPF_DX10, sizeof(DDS_PIXELFORMAT) );
+
+ DDS_HEADER_DXT10* ext = reinterpret_cast<DDS_HEADER_DXT10*>( reinterpret_cast<uint8_t*>(header) + sizeof(DDS_HEADER) );
+ assert( ext );
+
+ memset( ext, 0, sizeof(DDS_HEADER_DXT10) );
+ ext->dxgiFormat = metadata.format;
+ ext->resourceDimension = metadata.dimension;
+
+#ifdef _AMD64_
+ if ( metadata.arraySize > 0xFFFFFFFF )
+ return E_INVALIDARG;
+#endif
+
+ if ( metadata.miscFlags & TEX_MISC_TEXTURECUBE )
+ {
+ ext->miscFlag |= TEX_MISC_TEXTURECUBE;
+ assert( (metadata.arraySize % 6) == 0 );
+ ext->arraySize = static_cast<UINT>( metadata.arraySize / 6 );
+ }
+ else
+ {
+ ext->arraySize = static_cast<UINT>( metadata.arraySize );
+ }
+ }
+ else
+ {
+ memcpy_s( &header->ddspf, sizeof(header->ddspf), &ddpf, sizeof(ddpf) );
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Converts an image row with optional clearing of alpha value to 1.0
+// Returns true if supported, false if expansion case not supported
+//-------------------------------------------------------------------------------------
+enum TEXP_LEGACY_FORMAT
+{
+ TEXP_LEGACY_UNKNOWN = 0,
+ TEXP_LEGACY_R8G8B8,
+ TEXP_LEGACY_R3G3B2,
+ TEXP_LEGACY_A8R3G3B2,
+ TEXP_LEGACY_P8,
+ TEXP_LEGACY_A8P8,
+ TEXP_LEGACY_A4L4,
+ TEXP_LEGACY_B4G4R4A4,
+};
+
+inline static TEXP_LEGACY_FORMAT _FindLegacyFormat( DWORD flags )
+{
+ TEXP_LEGACY_FORMAT lformat = TEXP_LEGACY_UNKNOWN;
+
+ if ( flags & CONV_FLAGS_PAL8 )
+ {
+ lformat = ( flags & CONV_FLAGS_A8P8 ) ? TEXP_LEGACY_A8P8 : TEXP_LEGACY_P8;
+ }
+ else if ( flags & CONV_FLAGS_888 )
+ lformat = TEXP_LEGACY_R8G8B8;
+ else if ( flags & CONV_FLAGS_332 )
+ lformat = TEXP_LEGACY_R3G3B2;
+ else if ( flags & CONV_FLAGS_8332 )
+ lformat = TEXP_LEGACY_A8R3G3B2;
+ else if ( flags & CONV_FLAGS_44 )
+ lformat = TEXP_LEGACY_A4L4;
+#ifndef DXGI_1_2_FORMATS
+ else if ( flags & CONV_FLAGS_4444 )
+ lformat = TEXP_LEGACY_B4G4R4A4;
+#endif
+
+ return lformat;
+}
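As a worked example of the bit-replication arithmetic used by the expansion cases below, expanding a single D3DFMT_R3G3B2 byte to DXGI_FORMAT_R8G8B8A8_UNORM:

    // t  = 0xE4                                   -> R3 = 0b111, G3 = 0b001, B2 = 0b00
    // R8 = (R3 << 5) | (R3 << 2) | (R3 >> 1)            = 0xFF
    // G8 = (G3 << 5) | (G3 << 2) | (G3 >> 1)            = 0x24
    // B8 = (B2 << 6) | (B2 << 4) | (B2 << 2) | B2       = 0x00
    // packed DWORD (alpha forced opaque):
    //   0xFF000000 | (B8 << 16) | (G8 << 8) | R8        = 0xFF0024FF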
+
+static bool _LegacyExpandScanline( _Out_bytecap_(outSize) LPVOID pDestination, size_t outSize, _In_ DXGI_FORMAT outFormat,
+ _In_bytecount_(inSize) LPCVOID pSource, size_t inSize, _In_ TEXP_LEGACY_FORMAT inFormat,
+ _In_opt_count_c_(256) const uint32_t* pal8, _In_ DWORD flags )
+{
+ assert( pDestination && outSize > 0 );
+ assert( pSource && inSize > 0 );
+ assert( IsValid(outFormat) && !IsVideo(outFormat) );
+
+ switch( inFormat )
+ {
+ case TEXP_LEGACY_R8G8B8:
+ if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM )
+ return false;
+
+ // D3DFMT_R8G8B8 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 3, ocount += 4 )
+ {
+ // 24bpp Direct3D 9 files are actually BGR, so need to swizzle as well
+ uint32_t t1 = ( *(sPtr) << 16 );
+ uint32_t t2 = ( *(sPtr+1) << 8 );
+ uint32_t t3 = *(sPtr+2);
+
+ *(dPtr++) = t1 | t2 | t3 | 0xff000000;
+ sPtr += 3;
+ }
+ }
+ return true;
+
+ case TEXP_LEGACY_R3G3B2:
+ switch( outFormat )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ // D3DFMT_R3G3B2 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint8_t* __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 )
+ {
+ uint8_t t = *(sPtr++);
+
+ uint32_t t1 = (t & 0xe0) | ((t & 0xe0) >> 3) | ((t & 0xc0) >> 6);
+ uint32_t t2 = ((t & 0x1c) << 11) | ((t & 0x1c) << 8) | ((t & 0x18) << 5);
+ uint32_t t3 = ((t & 0x03) << 22) | ((t & 0x03) << 20) | ((t & 0x03) << 18) | ((t & 0x03) << 16);
+
+ *(dPtr++) = t1 | t2 | t3 | 0xff000000;
+ }
+ }
+ return true;
+
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ // D3DFMT_R3G3B2 -> DXGI_FORMAT_B5G6R5_UNORM
+ {
+ const uint8_t* __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 2 )
+ {
+ uint8_t t = *(sPtr++);
+
+ uint16_t t1 = ((t & 0xe0) << 8) | ((t & 0xc0) << 5);
+ uint16_t t2 = ((t & 0x1c) << 6) | ((t & 0x1c) << 3);
+ uint16_t t3 = ((t & 0x03) << 3) | ((t & 0x03) << 1) | ((t & 0x02) >> 1);
+
+ *(dPtr++) = t1 | t2 | t3;
+ }
+ }
+ return true;
+ }
+ break;
+
+ case TEXP_LEGACY_A8R3G3B2:
+ if ( outFormat != DXGI_FORMAT_R8G8B8A8_UNORM )
+ return false;
+
+ // D3DFMT_A8R3G3B2 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t* __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = (t & 0x00e0) | ((t & 0x00e0) >> 3) | ((t & 0x00c0) >> 6);
+ uint32_t t2 = ((t & 0x001c) << 11) | ((t & 0x001c) << 8) | ((t & 0x0018) << 5);
+ uint32_t t3 = ((t & 0x0003) << 22) | ((t & 0x0003) << 20) | ((t & 0x0003) << 18) | ((t & 0x0003) << 16);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0xff00) << 16);
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return true;
+
+ case TEXP_LEGACY_P8:
+ if ( (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM) || !pal8 )
+ return false;
+
+ // D3DFMT_P8 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint8_t* __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 )
+ {
+ uint8_t t = *(sPtr++);
+
+ *(dPtr++) = pal8[ t ];
+ }
+ }
+ return true;
+
+ case TEXP_LEGACY_A8P8:
+ if ( (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM) || !pal8 )
+ return false;
+
+ // D3DFMT_A8P8 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t* __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = pal8[ t & 0xff ];
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : ((t & 0xff00) << 16);
+
+ *(dPtr++) = t1 | ta;
+ }
+ }
+ return true;
+
+ case TEXP_LEGACY_A4L4:
+ switch( outFormat )
+ {
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM :
+ // D3DFMT_A4L4 -> DXGI_FORMAT_B4G4R4A4_UNORM
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint16_t * __restrict dPtr = reinterpret_cast<uint16_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 2 )
+ {
+ uint8_t t = *(sPtr++);
+
+ uint16_t t1 = (t & 0x0f);
+ uint16_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xf000 : ((t & 0xf0) << 8);
+
+ *(dPtr++) = t1 | (t1 << 4) | (t1 << 8) | ta;
+ }
+ }
+ return true;
+#endif // DXGI_1_2_FORMATS
+
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ // D3DFMT_A4L4 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint8_t * __restrict sPtr = reinterpret_cast<const uint8_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); ++icount, ocount += 4 )
+ {
+ uint8_t t = *(sPtr++);
+
+ uint32_t t1 = ((t & 0x0f) << 4) | (t & 0x0f);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (((t & 0xf0) << 24) | ((t & 0xf0) << 20));
+
+ *(dPtr++) = t1 | (t1 << 8) | (t1 << 16) | ta;
+ }
+ }
+ return true;
+ }
+ break;
+
+#ifndef DXGI_1_2_FORMATS
+ case TEXP_LEGACY_B4G4R4A4:
+ if (outFormat != DXGI_FORMAT_R8G8B8A8_UNORM)
+ return false;
+
+ // D3DFMT_A4R4G4B4 -> DXGI_FORMAT_R8G8B8A8_UNORM
+ {
+ const uint16_t * __restrict sPtr = reinterpret_cast<const uint16_t*>(pSource);
+ uint32_t * __restrict dPtr = reinterpret_cast<uint32_t*>(pDestination);
+
+ for( size_t ocount = 0, icount = 0; ((icount < inSize) && (ocount < outSize)); icount += 2, ocount += 4 )
+ {
+ uint16_t t = *(sPtr++);
+
+ uint32_t t1 = ((t & 0x0f00) >> 4) | ((t & 0x0f00) >> 8);
+ uint32_t t2 = ((t & 0x00f0) << 8) | ((t & 0x00f0) << 4);
+ uint32_t t3 = ((t & 0x000f) << 20) | ((t & 0x000f) << 16);
+ uint32_t ta = ( flags & TEXP_SCANLINE_SETALPHA ) ? 0xff000000 : (((t & 0xf000) << 16) | ((t & 0xf000) << 12));
+
+ *(dPtr++) = t1 | t2 | t3 | ta;
+ }
+ }
+ return true;
+#endif
+ }
+
+ return false;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Converts or copies image data from pPixels into scratch image data
+//-------------------------------------------------------------------------------------
+static HRESULT _CopyImage( _In_bytecount_(size) const void* pPixels, _In_ size_t size,
+ _In_ const TexMetadata& metadata, _In_ DWORD cpFlags, _In_ DWORD convFlags, _In_opt_count_c_(256) const uint32_t *pal8, _In_ const ScratchImage& image )
+{
+ assert( pPixels );
+ assert( image.GetPixels() );
+
+ if ( !size )
+ return E_FAIL;
+
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ if ( convFlags & CONV_FLAGS_888 )
+ cpFlags |= CP_FLAGS_24BPP;
+ else if ( convFlags & (CONV_FLAGS_565 | CONV_FLAGS_5551 | CONV_FLAGS_4444 | CONV_FLAGS_8332 | CONV_FLAGS_A8P8 ) )
+ cpFlags |= CP_FLAGS_16BPP;
+ else if ( convFlags & (CONV_FLAGS_44 | CONV_FLAGS_332 | CONV_FLAGS_PAL8) )
+ cpFlags |= CP_FLAGS_8BPP;
+ }
+
+ size_t pixelSize, nimages;
+ _DetermineImageArray( metadata, cpFlags, nimages, pixelSize );
+ if ( (nimages == 0) || (nimages != image.GetImageCount()) )
+ {
+ return E_FAIL;
+ }
+
+ assert( pixelSize <= size );
+
+ std::unique_ptr<Image[]> timages( new Image[nimages] );
+ if ( !_SetupImageArray( (uint8_t*)pPixels, size, metadata, cpFlags, timages.get(), nimages ) )
+ {
+ return E_FAIL;
+ }
+
+ if ( nimages != image.GetImageCount() )
+ {
+ return E_FAIL;
+ }
+
+ const Image* images = image.GetImages();
+ if ( !images )
+ {
+ return E_FAIL;
+ }
+
+ DWORD tflags = (convFlags & CONV_FLAGS_NOALPHA) ? TEXP_SCANLINE_SETALPHA : 0;
+ if ( convFlags & CONV_FLAGS_SWIZZLE )
+ tflags |= TEXP_SCANLINE_LEGACY;
+
+ switch (metadata.dimension)
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ {
+ size_t index = 0;
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ for( size_t level = 0; level < metadata.mipLevels; ++level, ++index )
+ {
+ if ( index >= nimages )
+ return E_FAIL;
+
+ if ( images[ index ].height != timages[ index ].height )
+ return E_FAIL;
+
+ size_t dpitch = images[ index ].rowPitch;
+ size_t spitch = timages[ index ].rowPitch;
+
+ const uint8_t *pSrc = const_cast<const uint8_t*>( timages[ index ].pixels );
+ if ( !pSrc )
+ return E_POINTER;
+
+ uint8_t *pDest = images[ index ].pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ if ( IsCompressed( metadata.format ) )
+ {
+ size_t csize = std::min<size_t>( images[ index ].slicePitch, timages[ index ].slicePitch );
+ memcpy_s( pDest, images[ index ].slicePitch, pSrc, csize );
+ }
+ else
+ {
+ for( size_t h = 0; h < images[ index ].height; ++h )
+ {
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+#ifdef DXGI_1_2_FORMATS
+ if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551|CONV_FLAGS_4444) )
+#else
+ if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551) )
+#endif
+ {
+ if ( !_ExpandScanline( pDest, dpitch, DXGI_FORMAT_R8G8B8A8_UNORM,
+ pSrc, spitch,
+ (convFlags & CONV_FLAGS_565) ? DXGI_FORMAT_B5G6R5_UNORM : DXGI_FORMAT_B5G5R5A1_UNORM,
+ tflags ) )
+ return E_FAIL;
+ }
+ else
+ {
+ TEXP_LEGACY_FORMAT lformat = _FindLegacyFormat( convFlags );
+ if ( !_LegacyExpandScanline( pDest, dpitch, metadata.format,
+ pSrc, spitch, lformat, pal8,
+ tflags ) )
+ return E_FAIL;
+ }
+ }
+ else if ( convFlags & CONV_FLAGS_SWIZZLE )
+ {
+ _SwizzleScanline( pDest, dpitch, pSrc, spitch,
+ metadata.format, tflags );
+ }
+ else
+ {
+ _CopyScanline( pDest, dpitch, pSrc, spitch,
+ metadata.format, tflags );
+ }
+
+ pSrc += spitch;
+ pDest += dpitch;
+ }
+ }
+ }
+ }
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ {
+ size_t index = 0;
+ size_t d = metadata.depth;
+
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ for( size_t slice = 0; slice < d; ++slice, ++index )
+ {
+ if ( index >= nimages )
+ return E_FAIL;
+
+ if ( images[ index ].height != timages[ index ].height )
+ return E_FAIL;
+
+ size_t dpitch = images[ index ].rowPitch;
+ size_t spitch = timages[ index ].rowPitch;
+
+ const uint8_t *pSrc = const_cast<const uint8_t*>( timages[ index ].pixels );
+ if ( !pSrc )
+ return E_POINTER;
+
+ uint8_t *pDest = images[ index ].pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ if ( IsCompressed( metadata.format ) )
+ {
+ size_t csize = std::min<size_t>( images[ index ].slicePitch, timages[ index ].slicePitch );
+ memcpy_s( pDest, images[ index ].slicePitch, pSrc, csize );
+ }
+ else
+ {
+ for( size_t h = 0; h < images[ index ].height; ++h )
+ {
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+#ifdef DXGI_1_2_FORMATS
+ if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551|CONV_FLAGS_4444) )
+#else
+ if ( convFlags & (CONV_FLAGS_565|CONV_FLAGS_5551) )
+#endif
+ {
+ if ( !_ExpandScanline( pDest, dpitch, DXGI_FORMAT_R8G8B8A8_UNORM,
+ pSrc, spitch,
+ (convFlags & CONV_FLAGS_565) ? DXGI_FORMAT_B5G6R5_UNORM : DXGI_FORMAT_B5G5R5A1_UNORM,
+ tflags ) )
+ return E_FAIL;
+ }
+ else
+ {
+ TEXP_LEGACY_FORMAT lformat = _FindLegacyFormat( convFlags );
+ if ( !_LegacyExpandScanline( pDest, dpitch, metadata.format,
+ pSrc, spitch, lformat, pal8,
+ tflags ) )
+ return E_FAIL;
+ }
+ }
+ else if ( convFlags & CONV_FLAGS_SWIZZLE )
+ {
+ _SwizzleScanline( pDest, dpitch, pSrc, spitch, metadata.format, tflags );
+ }
+ else
+ {
+ _CopyScanline( pDest, dpitch, pSrc, spitch, metadata.format, tflags );
+ }
+
+ pSrc += spitch;
+ pDest += dpitch;
+ }
+ }
+ }
+
+ if ( d > 1 )
+ d >>= 1;
+ }
+ }
+ break;
+
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+static HRESULT _CopyImageInPlace( DWORD convFlags, _In_ const ScratchImage& image )
+{
+ if ( !image.GetPixels() )
+ return E_FAIL;
+
+ const Image* images = image.GetImages();
+ if ( !images )
+ return E_FAIL;
+
+ const TexMetadata& metadata = image.GetMetadata();
+
+ DWORD tflags = (convFlags & CONV_FLAGS_NOALPHA) ? TEXP_SCANLINE_SETALPHA : 0;
+ if ( convFlags & CONV_FLAGS_SWIZZLE )
+ tflags |= TEXP_SCANLINE_LEGACY;
+
+ for( size_t i = 0; i < image.GetImageCount(); ++i )
+ {
+ const Image* img = &images[ i ];
+ uint8_t *pPixels = img->pixels;
+ if ( !pPixels )
+ return E_POINTER;
+
+ size_t rowPitch = img->rowPitch;
+
+ for( size_t h = 0; h < img->height; ++h )
+ {
+ if ( convFlags & CONV_FLAGS_SWIZZLE )
+ {
+ _SwizzleScanline( pPixels, rowPitch, pPixels, rowPitch, metadata.format, tflags );
+ }
+ else
+ {
+ _CopyScanline( pPixels, rowPitch, pPixels, rowPitch, metadata.format, tflags );
+ }
+
+ pPixels += rowPitch;
+ }
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Obtain metadata from DDS file in memory/on disk
+//-------------------------------------------------------------------------------------
+
+HRESULT GetMetadataFromDDSMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata& metadata )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+ return _DecodeDDSHeader( pSource, size, flags, metadata, 0 );
+}
+
+HRESULT GetMetadataFromDDSFile( LPCWSTR szFile, DWORD flags, TexMetadata& metadata )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING,
+ FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) );
+#endif
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ // Get the file size
+ LARGE_INTEGER fileSize = {0};
+
+#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA)
+ FILE_STANDARD_INFO fileInfo;
+ if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+ fileSize = fileInfo.EndOfFile;
+#else
+ if ( !GetFileSizeEx( hFile.get(), &fileSize ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+#endif
+
+ // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file)
+ if ( fileSize.HighPart > 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+ }
+
+ // Need at least enough data to fill the standard header and magic number to be a valid DDS
+ if ( fileSize.LowPart < ( sizeof(DDS_HEADER) + sizeof(uint32_t) ) )
+ {
+ return E_FAIL;
+ }
+
+ // Read the header in (including extended header if present)
+ const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
+ uint8_t header[MAX_HEADER_SIZE];
+
+ DWORD bytesRead = 0;
+ if ( !ReadFile( hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ return _DecodeDDSHeader( header, bytesRead, flags, metadata, 0 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a DDS file in memory
+//-------------------------------------------------------------------------------------
+HRESULT LoadFromDDSMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+ image.Release();
+
+ DWORD convFlags = 0;
+ TexMetadata mdata;
+ HRESULT hr = _DecodeDDSHeader( pSource, size, flags, mdata, &convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ size_t offset = sizeof(uint32_t) + sizeof(DDS_HEADER);
+ if ( convFlags & CONV_FLAGS_DX10 )
+ offset += sizeof(DDS_HEADER_DXT10);
+
+ assert( offset <= size );
+
+ const uint32_t *pal8 = nullptr;
+ if ( convFlags & CONV_FLAGS_PAL8 )
+ {
+ pal8 = reinterpret_cast<const uint32_t*>( reinterpret_cast<const uint8_t*>(pSource) + offset );
+ assert( pal8 );
+ offset += ( 256 * sizeof(uint32_t) );
+ if ( size < offset )
+ return E_FAIL;
+ }
+
+ hr = image.Initialize( mdata );
+ if ( FAILED(hr) )
+ return hr;
+
+ LPCVOID pPixels = reinterpret_cast<LPCVOID>( reinterpret_cast<const uint8_t*>(pSource) + offset );
+ assert( pPixels );
+ hr = _CopyImage( pPixels, size - offset, mdata,
+ (flags & DDS_FLAGS_LEGACY_DWORD) ? CP_FLAGS_LEGACY_DWORD : CP_FLAGS_NONE, convFlags, pal8, image );
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a DDS file from disk
+//-------------------------------------------------------------------------------------
+HRESULT LoadFromDDSFile( LPCWSTR szFile, DWORD flags, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ image.Release();
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle ( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle ( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING,
+ FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) );
+#endif
+
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ // Get the file size
+ LARGE_INTEGER fileSize = {0};
+
+#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA)
+ FILE_STANDARD_INFO fileInfo;
+ if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+ fileSize = fileInfo.EndOfFile;
+#else
+ if ( !GetFileSizeEx( hFile.get(), &fileSize ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+#endif
+
+ // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file)
+ if ( fileSize.HighPart > 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+ }
+
+ // Need at least enough data to fill the standard header and magic number to be a valid DDS
+ if ( fileSize.LowPart < ( sizeof(DDS_HEADER) + sizeof(uint32_t) ) )
+ {
+ return E_FAIL;
+ }
+
+ // Read the header in (including extended header if present)
+ const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
+ uint8_t header[MAX_HEADER_SIZE];
+
+ DWORD bytesRead = 0;
+ if ( !ReadFile( hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ DWORD convFlags = 0;
+ TexMetadata mdata;
+ HRESULT hr = _DecodeDDSHeader( header, bytesRead, flags, mdata, &convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ DWORD offset = MAX_HEADER_SIZE;
+
+ if ( !(convFlags & CONV_FLAGS_DX10) )
+ {
+ // Must reset file position since we read more than the standard header above
+ LARGE_INTEGER filePos = { sizeof(uint32_t) + sizeof(DDS_HEADER), 0};
+ if ( !SetFilePointerEx( hFile.get(), filePos, 0, FILE_BEGIN ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ offset = sizeof(uint32_t) + sizeof(DDS_HEADER);
+ }
+
+ std::unique_ptr<uint32_t[]> pal8;
+ if ( convFlags & CONV_FLAGS_PAL8 )
+ {
+ pal8.reset( new uint32_t[256] );
+ if ( !pal8 )
+ {
+ return E_OUTOFMEMORY;
+ }
+
+ if ( !ReadFile( hFile.get(), pal8.get(), 256 * sizeof(uint32_t), &bytesRead, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesRead != (256 * sizeof(uint32_t)) )
+ {
+ return E_FAIL;
+ }
+
+ offset += ( 256 * sizeof(uint32_t) );
+ }
+
+ DWORD remaining = fileSize.LowPart - offset;
+ if ( remaining == 0 )
+ return E_FAIL;
+
+ hr = image.Initialize( mdata );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( (convFlags & CONV_FLAGS_EXPAND) || (flags & DDS_FLAGS_LEGACY_DWORD) )
+ {
+ std::unique_ptr<uint8_t[]> temp( new uint8_t[ remaining ] );
+ if ( !temp )
+ {
+ image.Release();
+ return E_OUTOFMEMORY;
+ }
+
+ if ( !ReadFile( hFile.get(), temp.get(), remaining, &bytesRead, 0 ) )
+ {
+ image.Release();
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesRead != remaining )
+ {
+ image.Release();
+ return E_FAIL;
+ }
+
+ hr = _CopyImage( temp.get(), remaining, mdata,
+ (flags & DDS_FLAGS_LEGACY_DWORD) ? CP_FLAGS_LEGACY_DWORD : CP_FLAGS_NONE,
+ convFlags, pal8.get(), image );
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+ }
+ else
+ {
+ if ( remaining > image.GetPixelsSize() )
+ {
+ image.Release();
+ return E_FAIL;
+ }
+
+ if ( !ReadFile( hFile.get(), image.GetPixels(), static_cast<DWORD>( image.GetPixelsSize() ), &bytesRead, 0 ) )
+ {
+ image.Release();
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( convFlags & (CONV_FLAGS_SWIZZLE|CONV_FLAGS_NOALPHA) )
+ {
+ // Swizzle/copy image in place
+ hr = _CopyImageInPlace( convFlags, image );
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+ }
+ }
+
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
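A minimal usage sketch for the load path above (the helper name, file name, and error handling are illustrative, not part of the library):

    #include "DirectXTex.h"
    using namespace DirectX;

    // Hypothetical helper: read metadata, then load the full DDS into a ScratchImage.
    HRESULT LoadTextureExample( LPCWSTR szFile, TexMetadata& info, ScratchImage& image )
    {
        HRESULT hr = GetMetadataFromDDSFile( szFile, DDS_FLAGS_NONE, info );
        if ( FAILED(hr) )
            return hr;

        // LoadFromDDSFile re-reads the header; passing &info also receives the metadata.
        return LoadFromDDSFile( szFile, DDS_FLAGS_NONE, &info, image );
    }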
+
+
+//-------------------------------------------------------------------------------------
+// Save a DDS file to memory
+//-------------------------------------------------------------------------------------
+HRESULT SaveToDDSMemory( const Image* images, size_t nimages, const TexMetadata& metadata, DWORD flags, Blob& blob )
+{
+ if ( !images || (nimages == 0) )
+ return E_INVALIDARG;
+
+ // Determine memory required
+ size_t required = 0;
+ HRESULT hr = _EncodeDDSHeader( metadata, flags, 0, 0, required );
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t i = 0; i < nimages; ++i )
+ {
+ required += images[ i ].slicePitch;
+ if ( !images[ i ].pixels )
+ return E_POINTER;
+ }
+
+ assert( required > 0 );
+
+ blob.Release();
+
+ hr = blob.Initialize( required );
+ if ( FAILED(hr) )
+ return hr;
+
+ uint8_t* pDestination = reinterpret_cast<uint8_t*>( blob.GetBufferPointer() );
+ assert( pDestination );
+
+ hr = _EncodeDDSHeader( metadata, flags, pDestination, blob.GetBufferSize(), required );
+ if ( FAILED(hr) )
+ {
+ blob.Release();
+ return hr;
+ }
+
+ size_t remaining = blob.GetBufferSize() - required;
+ pDestination += required;
+
+ if ( !remaining )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+
+ switch( metadata.dimension )
+ {
+ case DDS_DIMENSION_TEXTURE1D:
+ case DDS_DIMENSION_TEXTURE2D:
+ {
+ size_t index = 0;
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ if ( index >= nimages )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+
+ size_t pixsize = images[ index ].slicePitch;
+ if ( memcpy_s( pDestination, remaining, images[ index ].pixels, pixsize ) )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+ pDestination += pixsize;
+ remaining -= pixsize;
+
+ ++index;
+ }
+ }
+ }
+ break;
+
+ case DDS_DIMENSION_TEXTURE3D:
+ {
+ if ( metadata.arraySize != 1 )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+
+ size_t d = metadata.depth;
+
+ size_t index = 0;
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ for( size_t slice = 0; slice < d; ++slice )
+ {
+ if ( index >= nimages )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+
+ size_t pixsize = images[ index ].slicePitch;
+ if ( memcpy_s( pDestination, remaining, images[ index ].pixels, pixsize ) )
+ {
+ blob.Release();
+ return E_FAIL;
+ }
+ pDestination += pixsize;
+ remaining -= pixsize;
+
+ ++index;
+ }
+
+ if ( d > 1 )
+ d >>= 1;
+ }
+ }
+ break;
+
+ default:
+ blob.Release();
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Save a DDS file to disk
+//-------------------------------------------------------------------------------------
+HRESULT SaveToDDSFile( const Image* images, size_t nimages, const TexMetadata& metadata, DWORD flags, LPCWSTR szFile )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ // Create DDS Header
+ const size_t MAX_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
+ uint8_t header[MAX_HEADER_SIZE];
+ size_t required;
+ HRESULT hr = _EncodeDDSHeader( metadata, flags, header, MAX_HEADER_SIZE, required );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Create file and write header
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_WRITE, 0, CREATE_ALWAYS, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0 ) ) );
+#endif
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ DWORD bytesWritten;
+ if ( !WriteFile( hFile.get(), header, static_cast<DWORD>( required ), &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != required )
+ {
+ return E_FAIL;
+ }
+
+ // Write images
+ switch( metadata.dimension )
+ {
+ case DDS_DIMENSION_TEXTURE1D:
+ case DDS_DIMENSION_TEXTURE2D:
+ {
+ size_t index = 0;
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ for( size_t level = 0; level < metadata.mipLevels; ++level, ++index )
+ {
+ if ( index >= nimages )
+ return E_FAIL;
+
+ if ( !images[ index ].pixels )
+ return E_POINTER;
+
+ size_t pixsize = images[ index ].slicePitch;
+
+ if ( !WriteFile( hFile.get(), images[ index ].pixels, static_cast<DWORD>( pixsize ), &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != pixsize )
+ {
+ return E_FAIL;
+ }
+ }
+ }
+ }
+ break;
+
+ case DDS_DIMENSION_TEXTURE3D:
+ {
+ if ( metadata.arraySize != 1 )
+ return E_FAIL;
+
+ size_t d = metadata.depth;
+
+ size_t index = 0;
+ for( size_t level = 0; level < metadata.mipLevels; ++level )
+ {
+ for( size_t slice = 0; slice < d; ++slice, ++index )
+ {
+ if ( index >= nimages )
+ return E_FAIL;
+
+ if ( !images[ index ].pixels )
+ return E_POINTER;
+
+ size_t pixsize = images[ index ].slicePitch;
+
+ if ( !WriteFile( hFile.get(), images[ index ].pixels, static_cast<DWORD>( pixsize ), &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != pixsize )
+ {
+ return E_FAIL;
+ }
+ }
+
+ if ( d > 1 )
+ d >>= 1;
+ }
+ }
+ break;
+
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
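A matching usage sketch for the save path (assumes 'image' is a populated ScratchImage; the output file name is illustrative):

    // In-memory: the Blob ends up holding the complete DDS byte stream.
    Blob blob;
    HRESULT hr = SaveToDDSMemory( image.GetImages(), image.GetImageCount(),
                                  image.GetMetadata(), DDS_FLAGS_NONE, blob );

    // On disk: the same image set written straight to a file.
    if ( SUCCEEDED(hr) )
        hr = SaveToDDSFile( image.GetImages(), image.GetImageCount(),
                            image.GetMetadata(), DDS_FLAGS_NONE, L"output.dds" );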
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp
new file mode 100644
index 00000000..c90ea090
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexFlipRotate.cpp
@@ -0,0 +1,327 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexFlipRotate.cpp
+//
+// DirectX Texture Library - Image flip/rotate operations
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Do flip/rotate operation using WIC
+//-------------------------------------------------------------------------------------
+static HRESULT _PerformFlipRotateUsingWIC( _In_ const Image& srcImage, _In_ DWORD flags,
+ _In_ const WICPixelFormatGUID& pfGUID, _In_ const Image& destImage )
+{
+ if ( !srcImage.pixels || !destImage.pixels )
+ return E_POINTER;
+
+ assert( srcImage.format == destImage.format );
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICBitmap> source;
+ HRESULT hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( srcImage.width ), static_cast<UINT>( srcImage.height ), pfGUID,
+ static_cast<UINT>( srcImage.rowPitch ), static_cast<UINT>( srcImage.slicePitch ),
+ srcImage.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFlipRotator> FR;
+ hr = pWIC->CreateBitmapFlipRotator( &FR );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FR->Initialize( source.Get(), static_cast<WICBitmapTransformOptions>( flags ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID pfFR;
+ hr = FR->GetPixelFormat( &pfFR );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &pfFR, &pfGUID, sizeof(GUID) ) != 0 )
+ {
+ // Flip/rotate should return the same format as the source...
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ UINT nwidth, nheight;
+ hr = FR->GetSize( &nwidth, &nheight );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( destImage.width != nwidth || destImage.height != nheight )
+ return E_FAIL;
+
+ hr = FR->CopyPixels( 0, static_cast<UINT>( destImage.rowPitch ), static_cast<UINT>( destImage.slicePitch ), destImage.pixels );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Do conversion, flip/rotate using WIC, conversion cycle
+//-------------------------------------------------------------------------------------
+static HRESULT _PerformFlipRotateViaF32( _In_ const Image& srcImage, _In_ DWORD flags, _In_ const Image& destImage )
+{
+ if ( !srcImage.pixels || !destImage.pixels )
+ return E_POINTER;
+
+ assert( srcImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT );
+ assert( srcImage.format == destImage.format );
+
+ ScratchImage temp;
+ HRESULT hr = _ConvertToR32G32B32A32( srcImage, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *tsrc = temp.GetImage( 0, 0, 0 );
+ if ( !tsrc )
+ return E_POINTER;
+
+ ScratchImage rtemp;
+ hr = rtemp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, destImage.width, destImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *tdest = rtemp.GetImage( 0, 0, 0 );
+ if ( !tdest )
+ return E_POINTER;
+
+ hr = _PerformFlipRotateUsingWIC( *tsrc, flags, GUID_WICPixelFormat128bppRGBAFloat, *tdest );
+ if ( FAILED(hr) )
+ return hr;
+
+ temp.Release();
+
+ hr = _ConvertFromR32G32B32A32( *tdest, destImage );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Flip/rotate image
+//-------------------------------------------------------------------------------------
+HRESULT FlipRotate( const Image& srcImage, DWORD flags, ScratchImage& image )
+{
+ if ( !srcImage.pixels )
+ return E_POINTER;
+
+ if ( !flags )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ if ( IsCompressed( srcImage.format ) )
+ {
+ // We don't support flip/rotate operations on compressed images
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ static_assert( TEX_FR_ROTATE0 == WICBitmapTransformRotate0, "TEX_FR_ROTATE0 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE90 == WICBitmapTransformRotate90, "TEX_FR_ROTATE90 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE180 == WICBitmapTransformRotate180, "TEX_FR_ROTATE180 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE270 == WICBitmapTransformRotate270, "TEX_FR_ROTATE270 no longer matches WIC" );
+ static_assert( TEX_FR_FLIP_HORIZONTAL == WICBitmapTransformFlipHorizontal, "TEX_FR_FLIP_HORIZONTAL no longer matches WIC" );
+ static_assert( TEX_FR_FLIP_VERTICAL == WICBitmapTransformFlipVertical, "TEX_FR_FLIP_VERTICAL no longer matches WIC" );
+
+ // Only supports 90, 180, 270, or no rotation flags... not a combination of rotation flags
+ switch ( flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE180|TEX_FR_ROTATE270) )
+ {
+ case 0:
+ case TEX_FR_ROTATE90:
+ case TEX_FR_ROTATE180:
+ case TEX_FR_ROTATE270:
+ break;
+
+ default:
+ return E_INVALIDARG;
+ }
+
+ size_t nwidth = srcImage.width;
+ size_t nheight = srcImage.height;
+
+ if (flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE270))
+ {
+ nwidth = srcImage.height;
+ nheight = srcImage.width;
+ }
+
+ HRESULT hr = image.Initialize2D( srcImage.format, nwidth, nheight, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *rimage = image.GetImage( 0, 0, 0 );
+ if ( !rimage )
+ return E_POINTER;
+
+ WICPixelFormatGUID pfGUID;
+ if ( _DXGIToWIC( srcImage.format, pfGUID ) )
+ {
+ // Case 1: Source format is supported by Windows Imaging Component
+ hr = _PerformFlipRotateUsingWIC( srcImage, flags, pfGUID, *rimage );
+ }
+ else
+ {
+ // Case 2: Source format is not supported by WIC, so we have to convert, flip/rotate, and convert back
+ hr = _PerformFlipRotateViaF32( srcImage, flags, *rimage );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
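A usage sketch for this entry point (assumes 'loaded' is a ScratchImage holding an uncompressed 2D image; one rotation may be combined with a flip, but multiple rotation flags may not be combined):

    ScratchImage rotated;
    HRESULT hr = FlipRotate( *loaded.GetImage( 0, 0, 0 ),
                             TEX_FR_ROTATE90 | TEX_FR_FLIP_HORIZONTAL, rotated );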
+
+
+//-------------------------------------------------------------------------------------
+// Flip/rotate image (complex)
+//-------------------------------------------------------------------------------------
+HRESULT FlipRotate( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DWORD flags, ScratchImage& result )
+{
+ if ( !srcImages || !nimages )
+ return E_INVALIDARG;
+
+ if ( IsCompressed( metadata.format ) )
+ {
+ // We don't support flip/rotate operations on compressed images
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ static_assert( TEX_FR_ROTATE0 == WICBitmapTransformRotate0, "TEX_FR_ROTATE0 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE90 == WICBitmapTransformRotate90, "TEX_FR_ROTATE90 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE180 == WICBitmapTransformRotate180, "TEX_FR_ROTATE180 no longer matches WIC" );
+ static_assert( TEX_FR_ROTATE270 == WICBitmapTransformRotate270, "TEX_FR_ROTATE270 no longer matches WIC" );
+ static_assert( TEX_FR_FLIP_HORIZONTAL == WICBitmapTransformFlipHorizontal, "TEX_FR_FLIP_HORIZONTAL no longer matches WIC" );
+ static_assert( TEX_FR_FLIP_VERTICAL == WICBitmapTransformFlipVertical, "TEX_FR_FLIP_VERTICAL no longer matches WIC" );
+
+ // Only supports 90, 180, 270, or no rotation flags... not a combination of rotation flags
+ switch ( flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE180|TEX_FR_ROTATE270) )
+ {
+ case 0:
+ case TEX_FR_ROTATE90:
+ case TEX_FR_ROTATE180:
+ case TEX_FR_ROTATE270:
+ break;
+
+ default:
+ return E_INVALIDARG;
+ }
+
+ TexMetadata mdata2 = metadata;
+
+ bool flipwh = false;
+ if (flags & (TEX_FR_ROTATE90|TEX_FR_ROTATE270))
+ {
+ flipwh = true;
+ mdata2.width = metadata.height;
+ mdata2.height = metadata.width;
+ }
+
+ HRESULT hr = result.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != result.GetImageCount() )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = result.GetImages();
+ if ( !dest )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ WICPixelFormatGUID pfGUID;
+ bool wicpf = _DXGIToWIC( metadata.format, pfGUID );
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ const Image& src = srcImages[ index ];
+ if ( src.format != metadata.format )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+#ifdef _AMD64_
+        if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) )
+        {
+            result.Release();
+            return E_FAIL;
+        }
+#endif
+
+ const Image& dst = dest[ index ];
+ assert( dst.format == metadata.format );
+
+ if ( flipwh )
+ {
+ if ( src.width != dst.height || src.height != dst.width )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+ }
+ else
+ {
+ if ( src.width != dst.width || src.height != dst.height )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+ }
+
+ if (wicpf)
+ {
+ // Case 1: Source format is supported by Windows Imaging Component
+ hr = _PerformFlipRotateUsingWIC( src, flags, pfGUID, dst );
+ }
+ else
+ {
+ // Case 2: Source format is not supported by WIC, so we have to convert, flip/rotate, and convert back
+ hr = _PerformFlipRotateViaF32( src, flags, dst );
+ }
+
+ if ( FAILED(hr) )
+ {
+ result.Release();
+ return hr;
+ }
+ }
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp
new file mode 100644
index 00000000..95d54267
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexImage.cpp
@@ -0,0 +1,674 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexImage.cpp
+//
+// DirectX Texture Library - Image container
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
+extern bool _CalculateMipLevels( _In_ size_t width, _In_ size_t height, _Inout_ size_t& mipLevels );
+extern bool _CalculateMipLevels3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth, _Inout_ size_t& mipLevels );
+
+//-------------------------------------------------------------------------------------
+// Determines number of image array entries and pixel size
+//-------------------------------------------------------------------------------------
+void _DetermineImageArray( const TexMetadata& metadata, DWORD cpFlags,
+ size_t& nImages, size_t& pixelSize )
+{
+ assert( metadata.width > 0 && metadata.height > 0 && metadata.depth > 0 );
+ assert( metadata.arraySize > 0 );
+ assert( metadata.mipLevels > 0 );
+
+ size_t _pixelSize = 0;
+ size_t _nimages = 0;
+
+ switch( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t w = metadata.width;
+ size_t h = metadata.height;
+
+ for( size_t level=0; level < metadata.mipLevels; ++level )
+ {
+ size_t rowPitch, slicePitch;
+ ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags );
+
+ _pixelSize += slicePitch;
+ ++_nimages;
+
+ if ( h > 1 )
+ h >>= 1;
+
+ if ( w > 1 )
+ w >>= 1;
+ }
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ {
+ size_t w = metadata.width;
+ size_t h = metadata.height;
+ size_t d = metadata.depth;
+
+ for( size_t level=0; level < metadata.mipLevels; ++level )
+ {
+ size_t rowPitch, slicePitch;
+ ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags );
+
+ for( size_t slice=0; slice < d; ++slice )
+ {
+ _pixelSize += slicePitch;
+ ++_nimages;
+ }
+
+ if ( h > 1 )
+ h >>= 1;
+
+ if ( w > 1 )
+ w >>= 1;
+
+ if ( d > 1 )
+ d >>= 1;
+ }
+ }
+ break;
+
+ default:
+ assert( false );
+ break;
+ }
+
+ nImages = _nimages;
+ pixelSize = _pixelSize;
+}
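As a worked example of the accounting above (CP_FLAGS_NONE, so rowPitch is simply width * 4 for this format): a 64x64 DXGI_FORMAT_R8G8B8A8_UNORM 2D texture with arraySize 1 and a full 7-level mip chain yields

    // level 0: 64x64 -> slicePitch 16384
    // level 1: 32x32 -> slicePitch  4096
    // level 2: 16x16 -> slicePitch  1024
    // level 3:  8x8  -> slicePitch   256
    // level 4:  4x4  -> slicePitch    64
    // level 5:  2x2  -> slicePitch    16
    // level 6:  1x1  -> slicePitch     4
    // nImages = 7, pixelSize = 21844 bytes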
+
+
+//-------------------------------------------------------------------------------------
+// Fills in the image array entries
+//-------------------------------------------------------------------------------------
+bool _SetupImageArray( uint8_t *pMemory, size_t pixelSize,
+ const TexMetadata& metadata, DWORD cpFlags,
+ Image* images, size_t nImages )
+{
+ assert( pMemory );
+ assert( pixelSize > 0 );
+ assert( nImages > 0 );
+
+ if ( !images )
+ return false;
+
+ size_t index = 0;
+ uint8_t* pixels = pMemory;
+ const uint8_t* pEndBits = pMemory + pixelSize;
+
+ switch( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ if (metadata.arraySize == 0 || metadata.mipLevels == 0)
+ {
+ return false;
+ }
+
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t w = metadata.width;
+ size_t h = metadata.height;
+
+ for( size_t level=0; level < metadata.mipLevels; ++level )
+ {
+ if ( index >= nImages )
+ {
+ return false;
+ }
+
+ size_t rowPitch, slicePitch;
+ ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags );
+
+ images[index].width = w;
+ images[index].height = h;
+ images[index].format = metadata.format;
+ images[index].rowPitch = rowPitch;
+ images[index].slicePitch = slicePitch;
+ images[index].pixels = pixels;
+ ++index;
+
+ pixels += slicePitch;
+ if ( pixels > pEndBits )
+ {
+ return false;
+ }
+
+ if ( h > 1 )
+ h >>= 1;
+
+ if ( w > 1 )
+ w >>= 1;
+ }
+ }
+ return true;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ {
+ if (metadata.mipLevels == 0 || metadata.depth == 0)
+ {
+ return false;
+ }
+
+ size_t w = metadata.width;
+ size_t h = metadata.height;
+ size_t d = metadata.depth;
+
+ for( size_t level=0; level < metadata.mipLevels; ++level )
+ {
+ size_t rowPitch, slicePitch;
+ ComputePitch( metadata.format, w, h, rowPitch, slicePitch, cpFlags );
+
+ for( size_t slice=0; slice < d; ++slice )
+ {
+ if ( index >= nImages )
+ {
+ return false;
+ }
+
+ // We use the same memory organization that Direct3D 11 needs for D3D11_SUBRESOURCE_DATA
+                    // with all slices of a given miplevel being contiguous in memory (see the layout sketch after this function)
+ images[index].width = w;
+ images[index].height = h;
+ images[index].format = metadata.format;
+ images[index].rowPitch = rowPitch;
+ images[index].slicePitch = slicePitch;
+ images[index].pixels = pixels;
+ ++index;
+
+ pixels += slicePitch;
+ if ( pixels > pEndBits )
+ {
+ return false;
+ }
+ }
+
+ if ( h > 1 )
+ h >>= 1;
+
+ if ( w > 1 )
+ w >>= 1;
+
+ if ( d > 1 )
+ d >>= 1;
+ }
+ }
+ return true;
+
+ default:
+ return false;
+ }
+}
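To make the 3D ordering concrete, the layout produced for an 8x8x4 volume with a full mip chain (the D3D11_SUBRESOURCE_DATA-style ordering noted in the comment above):

    // level 0: slice 0, slice 1, slice 2, slice 3   (depth 4)
    // level 1: slice 0, slice 1                     (depth 2)
    // level 2: slice 0                              (depth 1)
    // level 3: slice 0                              (depth 1)
    // Each entry is one Image of slicePitch bytes; the slices of a level sit back to back.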
+
+
+//=====================================================================================
+// ScratchImage - Bitmap image container
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Methods
+//-------------------------------------------------------------------------------------
+HRESULT ScratchImage::Initialize( const TexMetadata& mdata )
+{
+ if ( !IsValid(mdata.format) || IsVideo(mdata.format) )
+ return E_INVALIDARG;
+
+ size_t mipLevels = mdata.mipLevels;
+
+ switch( mdata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ if ( !mdata.width || mdata.height != 1 || mdata.depth != 1 || !mdata.arraySize )
+ return E_INVALIDARG;
+
+ if ( !_CalculateMipLevels(mdata.width,1,mipLevels) )
+ return E_INVALIDARG;
+ break;
+
+ case TEX_DIMENSION_TEXTURE2D:
+ if ( !mdata.width || !mdata.height || mdata.depth != 1 || !mdata.arraySize )
+ return E_INVALIDARG;
+
+ if ( mdata.miscFlags & TEX_MISC_TEXTURECUBE )
+ {
+ if ( (mdata.arraySize % 6) != 0 )
+ return E_INVALIDARG;
+ }
+
+ if ( !_CalculateMipLevels(mdata.width,mdata.height,mipLevels) )
+ return E_INVALIDARG;
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ if ( !mdata.width || !mdata.height || !mdata.depth || mdata.arraySize != 1 )
+ return E_INVALIDARG;
+
+ if ( !_CalculateMipLevels3D(mdata.width,mdata.height,mdata.depth,mipLevels) )
+ return E_INVALIDARG;
+ break;
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
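+    // Release any existing allocation before (re)initializing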
+ Release();
+
+ _metadata.width = mdata.width;
+ _metadata.height = mdata.height;
+ _metadata.depth = mdata.depth;
+ _metadata.arraySize = mdata.arraySize;
+ _metadata.mipLevels = mipLevels;
+ _metadata.miscFlags = mdata.miscFlags & TEX_MISC_TEXTURECUBE;
+ _metadata.format = mdata.format;
+ _metadata.dimension = mdata.dimension;
+
+ size_t pixelSize, nimages;
+ _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize );
+
+ _image = new Image[ nimages ];
+ if ( !_image )
+ return E_OUTOFMEMORY;
+
+ _nimages = nimages;
+ memset( _image, 0, sizeof(Image) * nimages );
+
+ _memory = reinterpret_cast<uint8_t*>( _aligned_malloc( pixelSize, 16 ) );
+ if ( !_memory )
+ {
+ Release();
+ return E_OUTOFMEMORY;
+ }
+ _size = pixelSize;
+ if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) )
+ {
+ Release();
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::Initialize1D( DXGI_FORMAT fmt, size_t length, size_t arraySize, size_t mipLevels )
+{
+ if ( !IsValid(fmt) || IsVideo(fmt) || !length || !arraySize )
+ return E_INVALIDARG;
+
+ // 1D is a special case of the 2D case
+ HRESULT hr = Initialize2D( fmt, length, 1, arraySize, mipLevels );
+ if ( FAILED(hr) )
+ return hr;
+
+ _metadata.dimension = TEX_DIMENSION_TEXTURE1D;
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::Initialize2D( DXGI_FORMAT fmt, size_t width, size_t height, size_t arraySize, size_t mipLevels )
+{
+ if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !arraySize )
+ return E_INVALIDARG;
+
+ if ( !_CalculateMipLevels(width,height,mipLevels) )
+ return E_INVALIDARG;
+
+ Release();
+
+ _metadata.width = width;
+ _metadata.height = height;
+ _metadata.depth = 1;
+ _metadata.arraySize = arraySize;
+ _metadata.mipLevels = mipLevels;
+ _metadata.miscFlags = 0;
+ _metadata.format = fmt;
+ _metadata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ size_t pixelSize, nimages;
+ _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize );
+
+ _image = new Image[ nimages ];
+ if ( !_image )
+ return E_OUTOFMEMORY;
+
+ _nimages = nimages;
+ memset( _image, 0, sizeof(Image) * nimages );
+
+ _memory = reinterpret_cast<uint8_t*>( _aligned_malloc( pixelSize, 16 ) );
+ if ( !_memory )
+ {
+ Release();
+ return E_OUTOFMEMORY;
+ }
+ _size = pixelSize;
+ if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) )
+ {
+ Release();
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::Initialize3D( DXGI_FORMAT fmt, size_t width, size_t height, size_t depth, size_t mipLevels )
+{
+ if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !depth )
+ return E_INVALIDARG;
+
+ if ( !_CalculateMipLevels3D(width,height,depth,mipLevels) )
+ return E_INVALIDARG;
+
+ Release();
+
+ _metadata.width = width;
+ _metadata.height = height;
+ _metadata.depth = depth;
+ _metadata.arraySize = 1; // Direct3D 10.x/11 does not support arrays of 3D textures
+ _metadata.mipLevels = mipLevels;
+ _metadata.miscFlags = 0;
+ _metadata.format = fmt;
+ _metadata.dimension = TEX_DIMENSION_TEXTURE3D;
+
+ size_t pixelSize, nimages;
+ _DetermineImageArray( _metadata, CP_FLAGS_NONE, nimages, pixelSize );
+
+ _image = new Image[ nimages ];
+ if ( !_image )
+ {
+ Release();
+ return E_OUTOFMEMORY;
+ }
+ _nimages = nimages;
+ memset( _image, 0, sizeof(Image) * nimages );
+
+ _memory = reinterpret_cast<uint8_t*>( _aligned_malloc( pixelSize, 16 ) );
+ if ( !_memory )
+ {
+ Release();
+ return E_OUTOFMEMORY;
+ }
+ _size = pixelSize;
+
+ if ( !_SetupImageArray( _memory, pixelSize, _metadata, CP_FLAGS_NONE, _image, nimages ) )
+ {
+ Release();
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::InitializeCube( DXGI_FORMAT fmt, size_t width, size_t height, size_t nCubes, size_t mipLevels )
+{
+ if ( !IsValid(fmt) || IsVideo(fmt) || !width || !height || !nCubes )
+ return E_INVALIDARG;
+
+    // A Direct3D 11 cubemap is just a 2D texture array whose size is a multiple of 6 (one set of six faces per cube)
+ HRESULT hr = Initialize2D( fmt, width, height, nCubes * 6, mipLevels );
+ if ( FAILED(hr) )
+ return hr;
+
+ _metadata.miscFlags |= TEX_MISC_TEXTURECUBE;
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::InitializeFromImage( const Image& srcImage, bool allow1D )
+{
+ HRESULT hr = ( srcImage.height > 1 || !allow1D )
+ ? Initialize2D( srcImage.format, srcImage.width, srcImage.height, 1, 1 )
+ : Initialize1D( srcImage.format, srcImage.width, 1, 1 );
+
+ if ( FAILED(hr) )
+ return hr;
+
+ const uint8_t* sptr = reinterpret_cast<const uint8_t*>( srcImage.pixels );
+ if ( !sptr )
+ return E_POINTER;
+
+ uint8_t* dptr = reinterpret_cast<uint8_t*>( _image[0].pixels );
+ if ( !dptr )
+ return E_POINTER;
+
+ for( size_t y = 0; y < srcImage.height; ++y )
+ {
+ _CopyScanline( dptr, _image[0].rowPitch, sptr, srcImage.rowPitch, srcImage.format, TEXP_SCANLINE_NONE );
+ sptr += srcImage.rowPitch;
+ dptr += _image[0].rowPitch;
+ }
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::InitializeArrayFromImages( const Image* images, size_t nImages, bool allow1D )
+{
+ if ( !images || !nImages )
+ return E_INVALIDARG;
+
+ DXGI_FORMAT format = images[0].format;
+ size_t width = images[0].width;
+ size_t height = images[0].height;
+
+ for( size_t index=0; index < nImages; ++index )
+ {
+ if ( !images[index].pixels )
+ return E_POINTER;
+
+ if ( images[index].format != format || images[index].width != width || images[index].height != height )
+ {
+ // All images must be the same format, width, and height
+ return E_FAIL;
+ }
+ }
+
+ HRESULT hr = ( height > 1 || !allow1D )
+ ? Initialize2D( format, width, height, nImages, 1 )
+ : Initialize1D( format, width, nImages, 1 );
+
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t index=0; index < nImages; ++index )
+ {
+ const uint8_t* sptr = reinterpret_cast<const uint8_t*>( images[index].pixels );
+ if ( !sptr )
+ return E_POINTER;
+
+ assert( index < _nimages );
+ uint8_t* dptr = reinterpret_cast<uint8_t*>( _image[index].pixels );
+ if ( !dptr )
+ return E_POINTER;
+
+ for( size_t y = 0; y < height; ++y )
+ {
+ _CopyScanline( dptr, _image[index].rowPitch, sptr, images[index].rowPitch, format, TEXP_SCANLINE_NONE );
+ sptr += images[index].rowPitch;
+ dptr += _image[index].rowPitch;
+ }
+ }
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::InitializeCubeFromImages( const Image* images, size_t nImages )
+{
+ if ( !images || !nImages )
+ return E_INVALIDARG;
+
+    // A Direct3D 11 cubemap is just a 2D texture array whose size is a multiple of 6 (one set of six faces per cube)
+ if ( ( nImages % 6 ) != 0 )
+ return E_INVALIDARG;
+
+ HRESULT hr = InitializeArrayFromImages( images, nImages, false );
+ if ( FAILED(hr) )
+ return hr;
+
+ _metadata.miscFlags |= TEX_MISC_TEXTURECUBE;
+
+ return S_OK;
+}
+
+HRESULT ScratchImage::Initialize3DFromImages( const Image* images, size_t depth )
+{
+ if ( !images || !depth )
+ return E_INVALIDARG;
+
+ DXGI_FORMAT format = images[0].format;
+ size_t width = images[0].width;
+ size_t height = images[0].height;
+
+ for( size_t slice=0; slice < depth; ++slice )
+ {
+ if ( !images[slice].pixels )
+ return E_POINTER;
+
+ if ( images[slice].format != format || images[slice].width != width || images[slice].height != height )
+ {
+ // All images must be the same format, width, and height
+ return E_FAIL;
+ }
+ }
+
+ HRESULT hr = Initialize3D( format, width, height, depth, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t slice=0; slice < depth; ++slice )
+ {
+ const uint8_t* sptr = reinterpret_cast<const uint8_t*>( images[slice].pixels );
+ if ( !sptr )
+ return E_POINTER;
+
+ assert( slice < _nimages );
+ uint8_t* dptr = reinterpret_cast<uint8_t*>( _image[slice].pixels );
+ if ( !dptr )
+ return E_POINTER;
+
+ for( size_t y = 0; y < height; ++y )
+ {
+ _CopyScanline( dptr, _image[slice].rowPitch, sptr, images[slice].rowPitch, format, TEXP_SCANLINE_NONE );
+ sptr += images[slice].rowPitch;
+ dptr += _image[slice].rowPitch;
+ }
+ }
+
+ return S_OK;
+}
+
+void ScratchImage::Release()
+{
+ _nimages = 0;
+ _size = 0;
+
+ if ( _image )
+ {
+ delete [] _image;
+ _image = 0;
+ }
+
+ if ( _memory )
+ {
+ _aligned_free( _memory );
+ _memory = 0;
+ }
+
+ memset(&_metadata, 0, sizeof(_metadata));
+}
+
+bool ScratchImage::OverrideFormat( DXGI_FORMAT f )
+{
+ if ( !_image )
+ return false;
+
+ if ( !IsValid( f ) || IsVideo( f ) )
+ return false;
+
+ if ( ( BitsPerPixel( f ) != BitsPerPixel( _metadata.format ) )
+ || ( IsCompressed( f ) != IsCompressed( _metadata.format ) )
+ || ( IsPacked( f ) != IsPacked( _metadata.format ) ) )
+ {
+ // Can't change the effective pitch of the format this way
+ return false;
+ }
+
+ for( size_t index = 0; index < _nimages; ++index )
+ {
+ _image[ index ].format = f;
+ }
+
+ _metadata.format = f;
+
+ return true;
+}
+
+const Image* ScratchImage::GetImage(size_t mip, size_t item, size_t slice) const
+{
+ if ( mip >= _metadata.mipLevels )
+ return nullptr;
+
+ size_t index = 0;
+
+ switch( _metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ if ( slice > 0 )
+ return nullptr;
+
+ if ( item >= _metadata.arraySize )
+ return nullptr;
+
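+        // Images are laid out item-major (the full mip chain of item 0, then item 1, ...)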
+ index = item*( _metadata.mipLevels ) + mip;
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ if ( item > 0 )
+ {
+ // No support for arrays of volumes
+ return nullptr;
+ }
+ else
+ {
+ size_t d = _metadata.depth;
+
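+            // Skip the slices of each preceding mip level; every level has half as many slices as the one before (minimum 1)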
+ for( size_t level = 0; level < mip; ++level )
+ {
+ index += d;
+ if ( d > 1 )
+ d >>= 1;
+ }
+
+ if ( slice >= d )
+ return nullptr;
+
+ index += slice;
+ }
+ break;
+
+ default:
+ return nullptr;
+ }
+
+ return &_image[index];
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp
new file mode 100644
index 00000000..1e7e27cd
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexMipmaps.cpp
@@ -0,0 +1,1167 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexMipmaps.cpp
+//
+// DirectX Texture Library - Mip-map generation
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
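+// Box-filter normalization factors: 1/4 for a 2x2 average, 1/8 for a 2x2x2 average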
+static const XMVECTORF32 s_boxScale = { 0.25f, 0.25f, 0.25f, 0.25f };
+static const XMVECTORF32 s_boxScale3D = { 0.125f, 0.125f, 0.125f, 0.125f };
+
+//-------------------------------------------------------------------------------------
+// Mipmap helper functions
+//-------------------------------------------------------------------------------------
+inline static bool ispow2( _In_ size_t x )
+{
+ return ((x != 0) && !(x & (x - 1)));
+}
+
+static size_t _CountMips( _In_ size_t width, _In_ size_t height)
+{
+ size_t mipLevels = 1;
+
+ while ( height > 1 || width > 1 )
+ {
+ if ( height > 1 )
+ height >>= 1;
+
+ if ( width > 1 )
+ width >>= 1;
+
+ ++mipLevels;
+ }
+
+ return mipLevels;
+}
+
+bool _CalculateMipLevels( _In_ size_t width, _In_ size_t height, _Inout_ size_t& mipLevels )
+{
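+    // A requested value of 0 means compute the full chain, a value > 1 is validated against the maximum, and 1 leaves a single level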
+ if ( mipLevels > 1 )
+ {
+ size_t maxMips = _CountMips(width,height);
+ if ( mipLevels > maxMips )
+ return false;
+ }
+ else if ( mipLevels == 0 )
+ {
+ mipLevels = _CountMips(width,height);
+ }
+ else
+ {
+ mipLevels = 1;
+ }
+ return true;
+}
+
+static size_t _CountMips3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth)
+{
+ size_t mipLevels = 1;
+
+ while ( height > 1 || width > 1 || depth > 1 )
+ {
+ if ( height > 1 )
+ height >>= 1;
+
+ if ( width > 1 )
+ width >>= 1;
+
+ if ( depth > 1 )
+ depth >>= 1;
+
+ ++mipLevels;
+ }
+
+ return mipLevels;
+}
+
+bool _CalculateMipLevels3D( _In_ size_t width, _In_ size_t height, _In_ size_t depth, _Inout_ size_t& mipLevels )
+{
+ if ( mipLevels > 1 )
+ {
+ if ( !ispow2(width) || !ispow2(height) || !ispow2(depth) )
+ return false;
+
+ size_t maxMips = _CountMips3D(width,height,depth);
+ if ( mipLevels > maxMips )
+ return false;
+ }
+ else if ( mipLevels == 0 && ispow2(width) && ispow2(height) && ispow2(depth) )
+ {
+ mipLevels = _CountMips3D(width,height,depth);
+ }
+ else
+ {
+ mipLevels = 1;
+ }
+ return true;
+}
+
+#ifndef __MINGW32__
+
+static HRESULT _EnsureWicBitmapPixelFormat( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* src, _In_ DWORD filter,
+ _In_ const WICPixelFormatGUID& desiredPixelFormat,
+ _Deref_out_ IWICBitmap** dest )
+{
+ if ( !pWIC || !src || !dest )
+ return E_POINTER;
+
+ *dest = nullptr;
+
+ WICPixelFormatGUID actualPixelFormat;
+ HRESULT hr = src->GetPixelFormat( &actualPixelFormat );
+
+ if ( SUCCEEDED(hr) )
+ {
+ if ( memcmp( &actualPixelFormat, &desiredPixelFormat, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+ src->AddRef();
+ *dest = src;
+ }
+ else
+ {
+ ScopedObject<IWICFormatConverter> converter;
+ hr = pWIC->CreateFormatConverter( &converter );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = converter->Initialize( src, desiredPixelFormat, _GetWICDither(filter), 0, 0, WICBitmapPaletteTypeCustom );
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ hr = pWIC->CreateBitmapFromSource( converter.Get(), WICBitmapCacheOnDemand, dest );
+ }
+ }
+ }
+
+ return hr;
+}
+
+HRESULT _ResizeSeparateColorAndAlpha( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* original,
+ _In_ size_t newWidth, _In_ size_t newHeight, _In_ DWORD filter, _Inout_ const Image* img )
+{
+ if ( !pWIC || !original || !img )
+ return E_POINTER;
+
+ const WICBitmapInterpolationMode interpolationMode = _GetWICInterp(filter);
+
+ WICPixelFormatGUID desiredPixelFormat = GUID_WICPixelFormatUndefined;
+ HRESULT hr = original->GetPixelFormat( &desiredPixelFormat );
+
+ size_t colorBytesInPixel = 0;
+ size_t colorBytesPerPixel = 0;
+ size_t colorWithAlphaBytesPerPixel = 0;
+ WICPixelFormatGUID colorPixelFormat = GUID_WICPixelFormatUndefined;
+ WICPixelFormatGUID colorWithAlphaPixelFormat = GUID_WICPixelFormatUndefined;
+
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICComponentInfo> componentInfo;
+ hr = pWIC->CreateComponentInfo( desiredPixelFormat, &componentInfo );
+
+ ScopedObject<IWICPixelFormatInfo> pixelFormatInfo;
+ if ( SUCCEEDED(hr) )
+ {
+ hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo), (void**)&pixelFormatInfo );
+ }
+
+ UINT bitsPerPixel = 0;
+ if ( SUCCEEDED(hr) )
+ {
+ hr = pixelFormatInfo->GetBitsPerPixel( &bitsPerPixel );
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ if ( bitsPerPixel <= 32 )
+ {
+ colorBytesInPixel = colorBytesPerPixel = 3;
+ colorPixelFormat = GUID_WICPixelFormat24bppBGR;
+
+ colorWithAlphaBytesPerPixel = 4;
+ colorWithAlphaPixelFormat = GUID_WICPixelFormat32bppBGRA;
+ }
+ else
+ {
+#if(_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ if ( _IsWIC2() )
+ {
+ colorBytesInPixel = colorBytesPerPixel = 12;
+ colorPixelFormat = GUID_WICPixelFormat96bppRGBFloat;
+ }
+ else
+#endif
+ {
+ colorBytesInPixel = 12;
+ colorBytesPerPixel = 16;
+ colorPixelFormat = GUID_WICPixelFormat128bppRGBFloat;
+ }
+
+ colorWithAlphaBytesPerPixel = 16;
+ colorWithAlphaPixelFormat = GUID_WICPixelFormat128bppRGBAFloat;
+ }
+ }
+ }
+
+ // Resize color only image (no alpha channel)
+ ScopedObject<IWICBitmap> resizedColor;
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmapScaler> colorScaler;
+
+ hr = pWIC->CreateBitmapScaler(&colorScaler);
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmap> converted;
+
+ hr = _EnsureWicBitmapPixelFormat( pWIC, original, filter, colorPixelFormat, &converted );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = colorScaler->Initialize( converted.Get(), static_cast<UINT>(newWidth), static_cast<UINT>(newHeight), interpolationMode );
+ }
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmap> resized;
+
+ hr = pWIC->CreateBitmapFromSource( colorScaler.Get(), WICBitmapCacheOnDemand, &resized );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = _EnsureWicBitmapPixelFormat( pWIC, resized.Get(), filter, colorPixelFormat, &resizedColor );
+ }
+ }
+ }
+
+ // Resize color+alpha image
+ ScopedObject<IWICBitmap> resizedColorWithAlpha;
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmapScaler> colorWithAlphaScaler;
+
+ hr = pWIC->CreateBitmapScaler( &colorWithAlphaScaler );
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmap> converted;
+
+ hr = _EnsureWicBitmapPixelFormat( pWIC, original, filter, colorWithAlphaPixelFormat, &converted );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = colorWithAlphaScaler->Initialize( converted.Get(), static_cast<UINT>(newWidth), static_cast<UINT>(newHeight), interpolationMode );
+ }
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmap> resized;
+
+ hr = pWIC->CreateBitmapFromSource( colorWithAlphaScaler.Get(), WICBitmapCacheOnDemand, &resized );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = _EnsureWicBitmapPixelFormat( pWIC, resized.Get(), filter, colorWithAlphaPixelFormat, &resizedColorWithAlpha );
+ }
+ }
+ }
+
+ // Merge pixels (copying color channels from color only image to color+alpha image)
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmapLock> colorLock;
+ ScopedObject<IWICBitmapLock> colorWithAlphaLock;
+
+ hr = resizedColor->Lock( nullptr, WICBitmapLockRead, &colorLock );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = resizedColorWithAlpha->Lock( nullptr, WICBitmapLockWrite, &colorWithAlphaLock );
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ WICInProcPointer colorWithAlphaData = nullptr;
+ UINT colorWithAlphaSizeInBytes = 0;
+ UINT colorWithAlphaStride = 0;
+
+ hr = colorWithAlphaLock->GetDataPointer( &colorWithAlphaSizeInBytes, &colorWithAlphaData );
+ if ( SUCCEEDED(hr) )
+ {
+ if ( !colorWithAlphaData )
+ {
+ hr = E_POINTER;
+ }
+ else
+ {
+ hr = colorWithAlphaLock->GetStride( &colorWithAlphaStride );
+ }
+ }
+
+ WICInProcPointer colorData = nullptr;
+ UINT colorSizeInBytes = 0;
+ UINT colorStride = 0;
+ if ( SUCCEEDED(hr) )
+ {
+ hr = colorLock->GetDataPointer( &colorSizeInBytes, &colorData );
+ if ( SUCCEEDED(hr) )
+ {
+ if ( !colorData )
+ {
+ hr = E_POINTER;
+ }
+ else
+ {
+ hr = colorLock->GetStride( &colorStride );
+ }
+ }
+ }
+
+ for ( size_t j = 0; SUCCEEDED(hr) && j < newHeight; j++ )
+ {
+ for ( size_t i = 0; SUCCEEDED(hr) && i < newWidth; i++ )
+ {
+ size_t colorWithAlphaIndex = (j * colorWithAlphaStride) + (i * colorWithAlphaBytesPerPixel);
+ size_t colorIndex = (j * colorStride) + (i * colorBytesPerPixel);
+
+ if ( ((colorWithAlphaIndex + colorBytesInPixel) > colorWithAlphaSizeInBytes)
+ || ( (colorIndex + colorBytesPerPixel) > colorSizeInBytes) )
+ {
+ hr = E_INVALIDARG;
+ }
+ else
+ {
+ memcpy_s( colorWithAlphaData + colorWithAlphaIndex, colorWithAlphaBytesPerPixel, colorData + colorIndex, colorBytesInPixel );
+ }
+ }
+ }
+ }
+ }
+
+ if ( SUCCEEDED(hr) )
+ {
+ ScopedObject<IWICBitmap> wicBitmap;
+ hr = _EnsureWicBitmapPixelFormat( pWIC, resizedColorWithAlpha.Get(), filter, desiredPixelFormat, &wicBitmap );
+ if ( SUCCEEDED(hr) )
+ {
+ hr = wicBitmap->CopyPixels( nullptr, static_cast<UINT>(img->rowPitch), static_cast<UINT>(img->slicePitch), img->pixels );
+ }
+ }
+
+ return hr;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Generate a (2D) mip-map chain from a base image using WIC's image scaler
+//-------------------------------------------------------------------------------------
+static HRESULT _GenerateMipMapsUsingWIC( _In_ const Image& baseImage, _In_ DWORD filter, _In_ size_t levels,
+ _In_ const WICPixelFormatGUID& pfGUID, _In_ const ScratchImage& mipChain, _In_ size_t item )
+{
+ assert( levels > 1 );
+
+ if ( !baseImage.pixels || !mipChain.GetPixels() )
+ return E_POINTER;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ size_t width = baseImage.width;
+ size_t height = baseImage.height;
+
+ ScopedObject<IWICBitmap> source;
+ HRESULT hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( width ), static_cast<UINT>( height ), pfGUID,
+ static_cast<UINT>( baseImage.rowPitch ), static_cast<UINT>( baseImage.slicePitch ),
+ baseImage.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Copy base image to top miplevel
+ const Image *img0 = mipChain.GetImage( 0, item, 0 );
+ if ( !img0 )
+ return E_POINTER;
+
+ uint8_t* pDest = img0->pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ const uint8_t *pSrc = baseImage.pixels;
+ for( size_t h=0; h < height; ++h )
+ {
+ size_t msize = std::min<size_t>( img0->rowPitch, baseImage.rowPitch );
+ memcpy_s( pDest, img0->rowPitch, pSrc, msize );
+ pSrc += baseImage.rowPitch;
+ pDest += img0->rowPitch;
+ }
+
+ ScopedObject<IWICComponentInfo> componentInfo;
+ hr = pWIC->CreateComponentInfo( pfGUID, &componentInfo );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICPixelFormatInfo2> pixelFormatInfo;
+ hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo2), (void**)&pixelFormatInfo );
+ if ( FAILED(hr) )
+ return hr;
+
+ BOOL supportsTransparency = FALSE;
+ hr = pixelFormatInfo->SupportsTransparency( &supportsTransparency );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Resize base image to each target mip level
+ for( size_t level = 1; level < levels; ++level )
+ {
+ const Image *img = mipChain.GetImage( level, item, 0 );
+ if ( !img )
+ return E_POINTER;
+
+ if ( height > 1 )
+ height >>= 1;
+
+ if ( width > 1 )
+ width >>= 1;
+
+ assert( img->width == width && img->height == height && img->format == baseImage.format );
+
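+        // With TEX_FILTER_SEPARATE_ALPHA, the color channels and the alpha channel are scaled as separate images and then recombined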
+ if ( (filter & TEX_FILTER_SEPARATE_ALPHA) && supportsTransparency )
+ {
+ hr = _ResizeSeparateColorAndAlpha( pWIC, source.Get(), width, height, filter, img );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ ScopedObject<IWICBitmapScaler> scaler;
+ hr = pWIC->CreateBitmapScaler( &scaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->Initialize( source.Get(), static_cast<UINT>( width ), static_cast<UINT>( height ), _GetWICInterp( filter ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID pfScaler;
+ hr = scaler->GetPixelFormat( &pfScaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &pfScaler, &pfGUID, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+ hr = scaler->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ // The WIC bitmap scaler is free to return a different pixel format than the source image, so here we
+ // convert it back
+ ScopedObject<IWICFormatConverter> FC;
+ hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( scaler.Get(), pfGUID, _GetWICDither( filter ), 0, 0, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Generate volume mip-map helpers
+//-------------------------------------------------------------------------------------
+static HRESULT _Setup3DMips( _In_count_(depth) const Image* baseImages, _In_ size_t depth, size_t levels,
+ _Out_ ScratchImage& mipChain )
+{
+ if ( !baseImages || !depth )
+ return E_INVALIDARG;
+
+ assert( levels > 1 );
+
+ size_t width = baseImages[0].width;
+ size_t height = baseImages[0].height;
+
+ HRESULT hr = mipChain.Initialize3D( baseImages[0].format, width, height, depth, levels );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Copy base images to top slice
+ for( size_t slice=0; slice < depth; ++slice )
+ {
+ const Image& src = baseImages[slice];
+
+ const Image *dest = mipChain.GetImage( 0, 0, slice );
+ if ( !dest )
+ {
+ mipChain.Release();
+ return E_POINTER;
+ }
+
+ assert( src.format == dest->format );
+
+ uint8_t* pDest = dest->pixels;
+ if ( !pDest )
+ {
+ mipChain.Release();
+ return E_POINTER;
+ }
+
+ const uint8_t *pSrc = src.pixels;
+ size_t rowPitch = src.rowPitch;
+ for( size_t h=0; h < height; ++h )
+ {
+ size_t msize = std::min<size_t>( dest->rowPitch, rowPitch );
+ memcpy_s( pDest, dest->rowPitch, pSrc, msize );
+ pSrc += rowPitch;
+ pDest += dest->rowPitch;
+ }
+ }
+
+ return S_OK;
+}
+
+static HRESULT _Generate3DMipsPointFilter( _In_ size_t depth, _In_ size_t levels, _In_ const ScratchImage& mipChain )
+{
+ if ( !depth || !mipChain.GetImages() )
+ return E_INVALIDARG;
+
+ // This assumes that the base images are already placed into the mipChain at the top level... (see _Setup3DMips)
+
+ assert( levels > 1 );
+
+ size_t width = mipChain.GetMetadata().width;
+ size_t height = mipChain.GetMetadata().height;
+
+ assert( ispow2(width) && ispow2(height) && ispow2(depth) );
+
+ // Allocate temporary space (2 scanlines)
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width*2), 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ XMVECTOR* target = scanline.get();
+
+ XMVECTOR* row = target + width;
+
+ // Resize base image to each target mip level
+ for( size_t level=1; level < levels; ++level )
+ {
+ if ( depth > 1 )
+ {
+ // 3D point filter
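+            // Each output texel takes the first sample of its 2x2x2 source block; the remaining samples are discarded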
+ for( size_t slice=0; slice < depth; slice += 2 )
+ {
+ const Image* src = mipChain.GetImage( level-1, 0, slice );
+ const Image* dest = mipChain.GetImage( level, 0, slice >> 1 );
+
+ if ( !src || !dest )
+ return E_POINTER;
+
+ const uint8_t* pSrc = src->pixels;
+ uint8_t* pDest = dest->pixels;
+
+ size_t rowPitch = src->rowPitch;
+
+ size_t nheight = height >> 1;
+
+ for( size_t y = 0; y < nheight; ++y )
+ {
+ if ( !_LoadScanline( row, width, pSrc, rowPitch, src->format ) )
+ return E_FAIL;
+ pSrc += rowPitch*2;
+
+ size_t nwidth = width >> 1;
+
+ for( size_t x = 0; x < nwidth; ++x )
+ {
+ target[ x ] = row[ x*2 ];
+ }
+
+ if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) )
+ return E_FAIL;
+ pDest += dest->rowPitch;
+ }
+ }
+ }
+ else
+ {
+ // 2D point filter
+ const Image* src = mipChain.GetImage( level-1, 0, 0 );
+ const Image* dest = mipChain.GetImage( level, 0, 0 );
+
+ if ( !src || !dest )
+ return E_POINTER;
+
+ const uint8_t* pSrc = src->pixels;
+ uint8_t* pDest = dest->pixels;
+
+ size_t rowPitch = src->rowPitch;
+
+ size_t nheight = height >> 1;
+
+ for( size_t y = 0; y < nheight; ++y )
+ {
+ if ( !_LoadScanline( row, width, pSrc, rowPitch, src->format ) )
+ return E_FAIL;
+ pSrc += rowPitch*2;
+
+ size_t nwidth = width >> 1;
+
+ for( size_t x = 0; x < nwidth; ++x )
+ {
+ target[ x ] = row[ x*2 ];
+ }
+
+ if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) )
+ return E_FAIL;
+ pDest += dest->rowPitch;
+ }
+ }
+
+ if ( height > 1 )
+ height >>= 1;
+
+ if ( width > 1 )
+ width >>= 1;
+
+ if ( depth > 1 )
+ depth >>= 1;
+ }
+
+ assert( height == 1 && width == 1 && depth == 1 );
+
+ return S_OK;
+}
+
+static HRESULT _Generate3DMipsBoxFilter( _In_ size_t depth, _In_ size_t levels, _In_ const ScratchImage& mipChain )
+{
+ if ( !depth || !mipChain.GetImages() )
+ return E_INVALIDARG;
+
+ // This assumes that the base images are already placed into the mipChain at the top level... (see _Setup3DMips)
+
+ assert( levels > 1 );
+
+ size_t width = mipChain.GetMetadata().width;
+ size_t height = mipChain.GetMetadata().height;
+
+ assert( ispow2(width) && ispow2(height) && ispow2(depth) );
+
+ // Allocate temporary space (5 scanlines)
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width*5), 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ XMVECTOR* target = scanline.get();
+
+ XMVECTOR* urow0 = target + width;
+ XMVECTOR* urow1 = target + width*2;
+ XMVECTOR* vrow0 = target + width*3;
+ XMVECTOR* vrow1 = target + width*4;
+
+ const XMVECTOR* urow2 = urow0 + 1;
+ const XMVECTOR* urow3 = urow1 + 1;
+ const XMVECTOR* vrow2 = vrow0 + 1;
+ const XMVECTOR* vrow3 = vrow1 + 1;
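+    // urow2/urow3/vrow2/vrow3 point one texel to the right of urow0/urow1/vrow0/vrow1, so indexing all eight with [x*2] covers a full 2x2x2 box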
+
+ // Resize base image to each target mip level
+ for( size_t level=1; level < levels; ++level )
+ {
+        if ( height == 1 )
+ {
+ urow0 = vrow0;
+ urow1 = vrow1;
+ }
+
+ if ( width == 1 )
+ {
+ urow2 = urow0;
+ urow3 = urow1;
+ vrow2 = vrow0;
+ vrow3 = vrow1;
+ }
+
+ if ( depth > 1 )
+ {
+ // 3D box filter
+ for( size_t slice=0; slice < depth; slice += 2 )
+ {
+ const Image* srca = mipChain.GetImage( level-1, 0, slice );
+ const Image* srcb = mipChain.GetImage( level-1, 0, slice+1 );
+ const Image* dest = mipChain.GetImage( level, 0, slice >> 1 );
+
+ if ( !srca || !srcb || !dest )
+ return E_POINTER;
+
+ const uint8_t* pSrc1 = srca->pixels;
+ const uint8_t* pSrc2 = srcb->pixels;
+ uint8_t* pDest = dest->pixels;
+
+ size_t aRowPitch = srca->rowPitch;
+ size_t bRowPitch = srcb->rowPitch;
+
+ size_t nheight = height >> 1;
+
+ for( size_t y = 0; y < nheight; ++y )
+ {
+ if ( !_LoadScanline( urow0, width, pSrc1, aRowPitch, srca->format ) )
+ return E_FAIL;
+ pSrc1 += aRowPitch;
+
+ if ( urow0 != urow1 )
+ {
+ if ( !_LoadScanline( urow1, width, pSrc1, aRowPitch, srca->format ) )
+ return E_FAIL;
+ pSrc1 += aRowPitch;
+ }
+
+ if ( urow0 != vrow0 )
+ {
+ if ( !_LoadScanline( vrow0, width, pSrc2, bRowPitch, srcb->format ) )
+ return E_FAIL;
+ pSrc2 += bRowPitch;
+ }
+
+ if ( urow0 != vrow1 && vrow0 != vrow1 )
+ {
+ if ( !_LoadScanline( vrow1, width, pSrc2, bRowPitch, srcb->format ) )
+ return E_FAIL;
+ pSrc2 += bRowPitch;
+ }
+
+ size_t nwidth = width >> 1;
+
+ for( size_t x = 0; x < nwidth; ++x )
+ {
+ size_t x2 = x*2;
+
+ // Box filter: Average 2x2x2 pixels
+ XMVECTOR v = XMVectorAdd( urow0[ x2 ], urow1[ x2 ] );
+ v = XMVectorAdd( v, urow2[ x2 ] );
+ v = XMVectorAdd( v, urow3[ x2 ] );
+ v = XMVectorAdd( v, vrow0[ x2 ] );
+ v = XMVectorAdd( v, vrow1[ x2 ] );
+ v = XMVectorAdd( v, vrow2[ x2 ] );
+ v = XMVectorAdd( v, vrow3[ x2 ] );
+
+ target[ x ] = XMVectorMultiply( v, s_boxScale3D );
+ }
+
+ if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) )
+ return E_FAIL;
+ pDest += dest->rowPitch;
+ }
+ }
+ }
+ else
+ {
+ // 2D box filter
+ const Image* src = mipChain.GetImage( level-1, 0, 0 );
+ const Image* dest = mipChain.GetImage( level, 0, 0 );
+
+ if ( !src || !dest )
+ return E_POINTER;
+
+ const uint8_t* pSrc = src->pixels;
+ uint8_t* pDest = dest->pixels;
+
+ size_t rowPitch = src->rowPitch;
+
+ size_t nheight = height >> 1;
+
+ for( size_t y = 0; y < nheight; ++y )
+ {
+ if ( !_LoadScanline( urow0, width, pSrc, rowPitch, src->format ) )
+ return E_FAIL;
+ pSrc += rowPitch;
+
+ if ( urow0 != urow1 )
+ {
+ if ( !_LoadScanline( urow1, width, pSrc, rowPitch, src->format ) )
+ return E_FAIL;
+ pSrc += rowPitch;
+ }
+
+ size_t nwidth = width >> 1;
+
+ for( size_t x = 0; x < nwidth; ++x )
+ {
+ size_t x2 = x*2;
+
+ // Box filter: Average 2x2 pixels
+ XMVECTOR v = XMVectorAdd( urow0[ x2 ], urow1[ x2 ] );
+ v = XMVectorAdd( v, urow2[ x2 ] );
+ v = XMVectorAdd( v, urow3[ x2 ] );
+
+ target[ x ] = XMVectorMultiply( v, s_boxScale );
+ }
+
+ if ( !_StoreScanline( pDest, dest->rowPitch, dest->format, target, nwidth ) )
+ return E_FAIL;
+ pDest += dest->rowPitch;
+ }
+ }
+
+ if ( height > 1 )
+ height >>= 1;
+
+ if ( width > 1 )
+ width >>= 1;
+
+ if ( depth > 1 )
+ depth >>= 1;
+ }
+
+ assert( height == 1 && width == 1 && depth == 1 );
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Generate mipmap chain
+//-------------------------------------------------------------------------------------
+HRESULT GenerateMipMaps( const Image& baseImage, DWORD filter, size_t levels, ScratchImage& mipChain, bool allow1D )
+{
+ if ( !IsValid( baseImage.format ) )
+ return E_INVALIDARG;
+
+ if ( !baseImage.pixels )
+ return E_POINTER;
+
+ if ( !_CalculateMipLevels(baseImage.width, baseImage.height, levels) )
+ return E_INVALIDARG;
+
+ if ( IsCompressed( baseImage.format ) || IsVideo( baseImage.format ) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" );
+ switch(filter & TEX_FILTER_MASK)
+ {
+ case 0:
+ case TEX_FILTER_POINT:
+ case TEX_FILTER_FANT: // Equivalent to Box filter
+ case TEX_FILTER_LINEAR:
+ case TEX_FILTER_CUBIC:
+ {
+ WICPixelFormatGUID pfGUID;
+ if ( _DXGIToWIC( baseImage.format, pfGUID ) )
+ {
+ // Case 1: Base image format is supported by Windows Imaging Component
+ HRESULT hr = (baseImage.height > 1 || !allow1D)
+ ? mipChain.Initialize2D( baseImage.format, baseImage.width, baseImage.height, 1, levels )
+ : mipChain.Initialize1D( baseImage.format, baseImage.width, 1, levels );
+ if ( FAILED(hr) )
+ return hr;
+
+ return _GenerateMipMapsUsingWIC( baseImage, filter, levels, pfGUID, mipChain, 0 );
+ }
+ else
+ {
+ // Case 2: Base image format is not supported by WIC, so we have to convert, generate, and convert back
+ assert( baseImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT );
+ ScratchImage temp;
+ HRESULT hr = _ConvertToR32G32B32A32( baseImage, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *timg = temp.GetImage( 0, 0, 0 );
+ if ( !timg )
+ return E_POINTER;
+
+ ScratchImage tMipChain;
+ hr = _GenerateMipMapsUsingWIC( *timg, filter, levels, GUID_WICPixelFormat128bppRGBAFloat, tMipChain, 0 );
+ if ( FAILED(hr) )
+ return hr;
+
+ temp.Release();
+
+ return _ConvertFromR32G32B32A32( tMipChain.GetImages(), tMipChain.GetImageCount(), tMipChain.GetMetadata(), baseImage.format, mipChain );
+ }
+ }
+ break;
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+}
+
+HRESULT GenerateMipMaps( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DWORD filter, size_t levels, ScratchImage& mipChain )
+{
+ if ( !srcImages || !nimages || !IsValid(metadata.format) )
+ return E_INVALIDARG;
+
+ if ( metadata.dimension == TEX_DIMENSION_TEXTURE3D
+ || IsCompressed( metadata.format ) || IsVideo( metadata.format ) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ if ( !_CalculateMipLevels(metadata.width, metadata.height, levels) )
+ return E_INVALIDARG;
+
+ static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" );
+ switch(filter & TEX_FILTER_MASK)
+ {
+ case 0:
+ case TEX_FILTER_POINT:
+ case TEX_FILTER_FANT: // Equivalent to Box filter
+ case TEX_FILTER_LINEAR:
+ case TEX_FILTER_CUBIC:
+ {
+ WICPixelFormatGUID pfGUID;
+ if ( _DXGIToWIC( metadata.format, pfGUID ) )
+ {
+ // Case 1: Base image format is supported by Windows Imaging Component
+ TexMetadata mdata2 = metadata;
+ mdata2.mipLevels = levels;
+ HRESULT hr = mipChain.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t index = metadata.ComputeIndex( 0, item, 0 );
+ if ( index >= nimages )
+ {
+ mipChain.Release();
+ return E_FAIL;
+ }
+
+ const Image& baseImage = srcImages[ index ];
+
+ hr = _GenerateMipMapsUsingWIC( baseImage, filter, levels, pfGUID, mipChain, item );
+ if ( FAILED(hr) )
+ {
+ mipChain.Release();
+ return hr;
+ }
+ }
+
+ return S_OK;
+ }
+ else
+ {
+ // Case 2: Base image format is not supported by WIC, so we have to convert, generate, and convert back
+ assert( metadata.format != DXGI_FORMAT_R32G32B32A32_FLOAT );
+
+ TexMetadata mdata2 = metadata;
+ mdata2.mipLevels = levels;
+ mdata2.format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+ ScratchImage tMipChain;
+ HRESULT hr = tMipChain.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t index = metadata.ComputeIndex( 0, item, 0 );
+ if ( index >= nimages )
+ return E_FAIL;
+
+ const Image& baseImage = srcImages[ index ];
+
+ ScratchImage temp;
+ hr = _ConvertToR32G32B32A32( baseImage, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *timg = temp.GetImage( 0, 0, 0 );
+ if ( !timg )
+ return E_POINTER;
+
+ hr = _GenerateMipMapsUsingWIC( *timg, filter, levels, GUID_WICPixelFormat128bppRGBAFloat, tMipChain, item );
+ if ( FAILED(hr) )
+ return hr;
+ }
+
+ return _ConvertFromR32G32B32A32( tMipChain.GetImages(), tMipChain.GetImageCount(), tMipChain.GetMetadata(), metadata.format, mipChain );
+ }
+ }
+ break;
+
+ default:
+        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// Generate mipmap chain for volume texture
+//-------------------------------------------------------------------------------------
+HRESULT GenerateMipMaps3D( const Image* baseImages, size_t depth, DWORD filter, size_t levels, ScratchImage& mipChain )
+{
+ if ( !baseImages || !depth )
+ return E_INVALIDARG;
+
+ DXGI_FORMAT format = baseImages[0].format;
+ size_t width = baseImages[0].width;
+ size_t height = baseImages[0].height;
+
+ if ( !ispow2(width) || !ispow2(height) || !ispow2(depth) )
+ return E_INVALIDARG;
+
+ if ( !_CalculateMipLevels3D(width, height, depth, levels) )
+ return E_INVALIDARG;
+
+ for( size_t slice=0; slice < depth; ++slice )
+ {
+ if ( !baseImages[slice].pixels )
+ return E_POINTER;
+
+ if ( baseImages[slice].format != format || baseImages[slice].width != width || baseImages[slice].height != height )
+ {
+ // All base images must be the same format, width, and height
+ return E_FAIL;
+ }
+ }
+
+ if ( IsCompressed( format ) )
+ {
+ // We don't support generating mipmaps from compressed images, as those should be generated before compression
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ HRESULT hr;
+
+ static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" );
+ switch( filter & TEX_FILTER_MASK )
+ {
+ case 0:
+ case TEX_FILTER_FANT:
+ hr = _Setup3DMips( baseImages, depth, levels, mipChain );
+ if ( FAILED(hr) )
+ return hr;
+
+ // For decimation, Fant is equivalent to a Box filter
+ hr = _Generate3DMipsBoxFilter( depth, levels, mipChain );
+ if ( FAILED(hr) )
+ mipChain.Release();
+ return hr;
+
+ case WIC_FLAGS_FILTER_POINT:
+ hr = _Setup3DMips( baseImages, depth, levels, mipChain );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = _Generate3DMipsPointFilter( depth, levels, mipChain );
+ if ( FAILED(hr) )
+ mipChain.Release();
+ return hr;
+
+ case WIC_FLAGS_FILTER_LINEAR:
+ // Need to implement a 3D bi-linear filter (2x2x2)
+ return E_NOTIMPL;
+
+ case WIC_FLAGS_FILTER_CUBIC:
+ // Need to implement a 3D bi-cubic filter (3x3x3)
+ return E_NOTIMPL;
+
+ default:
+        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+}
+
+HRESULT GenerateMipMaps3D( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DWORD filter, size_t levels, ScratchImage& mipChain )
+{
+ if ( !srcImages || !nimages || !IsValid(metadata.format)
+ || !ispow2(metadata.width) || !ispow2(metadata.height) || !ispow2(metadata.depth) )
+ return E_INVALIDARG;
+
+ if ( metadata.dimension != TEX_DIMENSION_TEXTURE3D
+ || IsCompressed( metadata.format ) || IsVideo( metadata.format ) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ if ( !_CalculateMipLevels3D(metadata.width, metadata.height, metadata.depth, levels) )
+ return E_INVALIDARG;
+
+    std::vector<Image> baseImages;
+ baseImages.reserve( metadata.depth );
+ for( size_t slice=0; slice < metadata.depth; ++slice )
+ {
+ size_t index = metadata.ComputeIndex( 0, 0, slice );
+ if ( index >= nimages )
+ return E_FAIL;
+
+ const Image& src = srcImages[ index ];
+ if ( !src.pixels )
+ return E_POINTER;
+
+ if ( src.format != metadata.format || src.width != metadata.width || src.height != metadata.height )
+ {
+ // All base images must be the same format, width, and height
+ return E_FAIL;
+ }
+
+ baseImages.push_back( src );
+ }
+
+ assert( baseImages.size() == metadata.depth );
+
+ HRESULT hr;
+
+ static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" );
+ switch( filter & TEX_FILTER_MASK )
+ {
+ case 0:
+ case TEX_FILTER_FANT:
+ hr = _Setup3DMips( &baseImages[0], metadata.depth, levels, mipChain );
+ if ( FAILED(hr) )
+ return hr;
+
+ // For decimation, Fant is equivalent to a Box filter
+ hr = _Generate3DMipsBoxFilter( metadata.depth, levels, mipChain );
+ if ( FAILED(hr) )
+ mipChain.Release();
+ return hr;
+
+ case WIC_FLAGS_FILTER_POINT:
+ hr = _Setup3DMips( &baseImages[0], metadata.depth, levels, mipChain );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = _Generate3DMipsPointFilter( metadata.depth, levels, mipChain );
+ if ( FAILED(hr) )
+ mipChain.Release();
+ return hr;
+
+ case WIC_FLAGS_FILTER_LINEAR:
+ // Need to implement a 3D bi-linear filter (2x2x2)
+ return E_NOTIMPL;
+
+ case WIC_FLAGS_FILTER_CUBIC:
+ // Need to implement a 3D bi-cubic filter (3x3x3)
+ return E_NOTIMPL;
+
+ default:
+        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+}
+
+#endif /* !__MINGW32__ */
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp
new file mode 100644
index 00000000..988fb1fb
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexMisc.cpp
@@ -0,0 +1,265 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexMisc.cpp
+//
+// DirectX Texture Library - Misc image operations
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+static HRESULT _ComputeMSE( _In_ const Image& image1, _In_ const Image& image2,
+ _Out_ float& mse, _Out_opt_cap_c_(4) float* mseV )
+{
+ if ( !image1.pixels || !image2.pixels )
+ return E_POINTER;
+
+ assert( image1.width == image2.width && image1.height == image2.height );
+ assert( !IsCompressed( image1.format ) && !IsCompressed( image2.format ) );
+
+ const size_t width = image1.width;
+
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width)*2, 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ const uint8_t *pSrc1 = image1.pixels;
+ const size_t rowPitch1 = image1.rowPitch;
+
+ const uint8_t *pSrc2 = image2.pixels;
+ const size_t rowPitch2 = image2.rowPitch;
+
+ XMVECTOR acc = XMVectorZero();
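+    // Accumulate per-channel squared differences; the scalar MSE below is the sum of the four per-channel averages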
+
+ for( size_t h = 0; h < image1.height; ++h )
+ {
+ XMVECTOR* ptr1 = scanline.get();
+ if ( !_LoadScanline( ptr1, width, pSrc1, rowPitch1, image1.format ) )
+ return E_FAIL;
+
+ XMVECTOR* ptr2 = scanline.get() + width;
+ if ( !_LoadScanline( ptr2, width, pSrc2, rowPitch2, image2.format ) )
+ return E_FAIL;
+
+ for( size_t i = 0; i < width; ++i, ++ptr1, ++ptr2 )
+ {
+ // sum[ (I1 - I2)^2 ]
+ XMVECTOR v = XMVectorSubtract( *ptr1, *ptr2 );
+ acc = XMVectorMultiplyAdd( v, v, acc );
+ }
+
+ pSrc1 += rowPitch1;
+ pSrc2 += rowPitch2;
+ }
+
+ // MSE = sum[ (I1 - I2)^2 ] / w*h
+ XMVECTOR d = XMVectorReplicate( float(image1.width * image1.height) );
+ XMVECTOR v = XMVectorDivide( acc, d );
+ if ( mseV )
+ {
+ XMStoreFloat4( reinterpret_cast<XMFLOAT4*>( mseV ), v );
+ mse = mseV[0] + mseV[1] + mseV[2] + mseV[3];
+ }
+ else
+ {
+ XMFLOAT4 _mseV;
+ XMStoreFloat4( &_mseV, v );
+ mse = _mseV.x + _mseV.y + _mseV.z + _mseV.w;
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Copies a rectangle from one image into another
+//-------------------------------------------------------------------------------------
+HRESULT CopyRectangle( const Image& srcImage, const Rect& srcRect, const Image& dstImage, DWORD filter, size_t xOffset, size_t yOffset )
+{
+ if ( !srcImage.pixels || !dstImage.pixels )
+ return E_POINTER;
+
+ if ( IsCompressed( srcImage.format ) || IsCompressed( dstImage.format ) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ // Validate rectangle/offset
+ if ( !srcRect.w || !srcRect.h || ( (srcRect.x + srcRect.w) > srcImage.width ) || ( (srcRect.y + srcRect.h) > srcImage.height ) )
+ {
+ return E_INVALIDARG;
+ }
+
+ if ( ( (xOffset + srcRect.w) > dstImage.width ) || ( (yOffset + srcRect.h) > dstImage.height ) )
+ {
+ return E_INVALIDARG;
+ }
+
+ // Compute source bytes-per-pixel
+ size_t sbpp = BitsPerPixel( srcImage.format );
+ if ( !sbpp )
+ return E_FAIL;
+
+ if ( sbpp < 8 )
+ {
+ // We don't support monochrome (DXGI_FORMAT_R1_UNORM)
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ const uint8_t* pEndSrc = srcImage.pixels + srcImage.rowPitch*srcImage.height;
+ const uint8_t* pEndDest = dstImage.pixels + dstImage.rowPitch*dstImage.height;
+
+ // Round to bytes
+ sbpp = ( sbpp + 7 ) / 8;
+
+ const uint8_t* pSrc = srcImage.pixels + (srcRect.y * srcImage.rowPitch) + (srcRect.x * sbpp);
+
+ if ( srcImage.format == dstImage.format )
+ {
+ // Direct copy case (avoid intermediate conversions)
+ uint8_t* pDest = dstImage.pixels + (yOffset * dstImage.rowPitch) + (xOffset * sbpp);
+ const size_t copyW = srcRect.w * sbpp;
+ for( size_t h=0; h < srcRect.h; ++h )
+ {
+ if ( ( (pSrc+copyW) > pEndSrc ) || (pDest > pEndDest) )
+ return E_FAIL;
+
+ memcpy_s( pDest, pEndDest - pDest, pSrc, copyW );
+
+ pSrc += srcImage.rowPitch;
+ pDest += dstImage.rowPitch;
+ }
+
+ return S_OK;
+ }
+
+ // Compute destination bytes-per-pixel (not the same format as source)
+ size_t dbpp = BitsPerPixel( dstImage.format );
+ if ( !dbpp )
+ return E_FAIL;
+
+ if ( dbpp < 8 )
+ {
+ // We don't support monochrome (DXGI_FORMAT_R1_UNORM)
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ // Round to bytes
+ dbpp = ( dbpp + 7 ) / 8;
+
+ uint8_t* pDest = dstImage.pixels + (yOffset * dstImage.rowPitch) + (xOffset * dbpp);
+
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*srcRect.w), 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ const size_t copyS = srcRect.w * sbpp;
+ const size_t copyD = srcRect.w * dbpp;
+
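+    // Conversion path: load each source scanline as RGBA32F, convert to the destination format, then store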
+ for( size_t h=0; h < srcRect.h; ++h )
+ {
+ if ( ( (pSrc+copyS) > pEndSrc) || ((pDest+copyD) > pEndDest) )
+ return E_FAIL;
+
+ if ( !_LoadScanline( scanline.get(), srcRect.w, pSrc, copyS, srcImage.format ) )
+ return E_FAIL;
+
+ _ConvertScanline( scanline.get(), srcRect.w, dstImage.format, srcImage.format, filter );
+
+ if ( !_StoreScanline( pDest, copyD, dstImage.format, scanline.get(), srcRect.w ) )
+ return E_FAIL;
+
+ pSrc += srcImage.rowPitch;
+ pDest += dstImage.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Computes the Mean-Squared-Error (MSE) between two images
+//-------------------------------------------------------------------------------------
+HRESULT ComputeMSE( const Image& image1, const Image& image2, float& mse, float* mseV )
+{
+ if ( !image1.pixels || !image2.pixels )
+ return E_POINTER;
+
+ if ( image1.width != image2.width || image1.height != image2.height )
+ return E_INVALIDARG;
+
+ if ( IsCompressed(image1.format) )
+ {
+ if ( IsCompressed(image2.format) )
+ {
+ // Case 1: both images are compressed, expand to RGBA32F
+ ScratchImage temp1;
+ HRESULT hr = Decompress( image1, DXGI_FORMAT_R32G32B32A32_FLOAT, temp1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScratchImage temp2;
+ hr = Decompress( image2, DXGI_FORMAT_R32G32B32A32_FLOAT, temp2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image* img1 = temp1.GetImage(0,0,0);
+ const Image* img2 = temp2.GetImage(0,0,0);
+ if ( !img1 || !img2 )
+ return E_POINTER;
+
+ return _ComputeMSE( *img1, *img2, mse, mseV );
+ }
+ else
+ {
+ // Case 2: image1 is compressed, expand to RGBA32F
+ ScratchImage temp;
+ HRESULT hr = Decompress( image1, DXGI_FORMAT_R32G32B32A32_FLOAT, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image* img = temp.GetImage(0,0,0);
+ if ( !img )
+ return E_POINTER;
+
+ return _ComputeMSE( *img, image2, mse, mseV );
+ }
+ }
+ else
+ {
+ if ( IsCompressed(image2.format) )
+ {
+ // Case 3: image2 is compressed, expand to RGBA32F
+ ScratchImage temp;
+ HRESULT hr = Decompress( image2, DXGI_FORMAT_R32G32B32A32_FLOAT, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image* img = temp.GetImage(0,0,0);
+ if ( !img )
+ return E_POINTER;
+
+ return _ComputeMSE( image1, *img, mse, mseV );
+ }
+ else
+ {
+ // Case 4: neither image is compressed
+ return _ComputeMSE( image1, image2, mse, mseV );
+ }
+ }
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp
new file mode 100644
index 00000000..ad594933
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexNormalMaps.cpp
@@ -0,0 +1,377 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexNormalMaps.cpp
+//
+// DirectX Texture Library - Normal map operations
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
+#pragma prefast(suppress : 25000, "FXMVECTOR is 16 bytes")
+static inline float _EvaluateColor( _In_ FXMVECTOR val, _In_ DWORD flags )
+{
+ XMFLOAT4A f;
+
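+    // Luminance weights (approximately the ITU-R BT.709 coefficients)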
+ static XMVECTORF32 lScale = { 0.2125f, 0.7154f, 0.0721f, 1.f };
+
+ static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" );
+ switch( flags & 0xf )
+ {
+ case 0:
+ case CNMAP_CHANNEL_RED: return XMVectorGetX( val );
+ case CNMAP_CHANNEL_GREEN: return XMVectorGetY( val );
+ case CNMAP_CHANNEL_BLUE: return XMVectorGetZ( val );
+ case CNMAP_CHANNEL_ALPHA: return XMVectorGetW( val );
+
+ case CNMAP_CHANNEL_LUMINANCE:
+ {
+ XMVECTOR v = XMVectorMultiply( val, lScale );
+ XMStoreFloat4A( &f, v );
+ return f.x + f.y + f.z;
+ }
+ break;
+
+ default:
+ assert(false);
+ return 0.f;
+ }
+}
+
+static void _EvaluateRow( _In_count_(width) const XMVECTOR* pSource, _Out_cap_(width+2) float* pDest,
+ _In_ size_t width, _In_ DWORD flags )
+{
+ assert( pSource && pDest );
+ assert( width > 0 );
+
+ for( size_t x = 0; x < width; ++x )
+ {
+ pDest[x+1] = _EvaluateColor( pSource[x], flags );
+ }
+
+ if ( flags & CNMAP_MIRROR_U )
+ {
+ // Mirror in U
+ pDest[0] = _EvaluateColor( pSource[0], flags );
+ pDest[width+1] = _EvaluateColor( pSource[width-1], flags );
+ }
+ else
+ {
+ // Wrap in U
+ pDest[0] = _EvaluateColor( pSource[width-1], flags );
+ pDest[width+1] = _EvaluateColor( pSource[0], flags );
+ }
+}
+
+static HRESULT _ComputeNMap( _In_ const Image& srcImage, _In_ DWORD flags, _In_ float amplitude,
+ _In_ DXGI_FORMAT format, _In_ const Image& normalMap )
+{
+ if ( !srcImage.pixels || !normalMap.pixels )
+ return E_INVALIDARG;
+
+ assert( !IsCompressed(format) && !IsTypeless( format ) );
+
+ const DWORD convFlags = _GetConvertFlags( format );
+ if ( !convFlags )
+ return E_FAIL;
+
+ if ( !( convFlags & (CONVF_UNORM | CONVF_SNORM | CONVF_FLOAT) ) )
+        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ const size_t width = srcImage.width;
+ const size_t height = srcImage.height;
+ if ( width != normalMap.width || height != normalMap.height )
+ return E_FAIL;
+
+ // Allocate temporary space (4 scanlines and 3 evaluated rows)
+ ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width*4), 16 ) ) );
+ if ( !scanline )
+ return E_OUTOFMEMORY;
+
+ ScopedAlignedArrayFloat buffer( reinterpret_cast<float*>( _aligned_malloc( ( ( sizeof(float) * ( width + 2 ) ) * 3 ), 16 ) ) );
+ if ( !buffer )
+ return E_OUTOFMEMORY;
+
+ uint8_t* pDest = normalMap.pixels;
+ if ( !pDest )
+ return E_POINTER;
+
+ XMVECTOR* row0 = scanline.get();
+ XMVECTOR* row1 = row0 + width;
+ XMVECTOR* row2 = row1 + width;
+ XMVECTOR* target = row2 + width;
+
+ float* val0 = buffer.get();
+ float* val1 = val0 + width + 2;
+ float* val2 = val1 + width + 2;
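+    // val0/val1/val2 hold the evaluated heights of the previous, current, and next rows, padded by one texel on each side for wrap/mirror addressing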
+
+ const size_t rowPitch = srcImage.rowPitch;
+ const uint8_t* pSrc = srcImage.pixels;
+
+ // Read first scanline row into 'row1'
+ if ( !_LoadScanline( row1, width, pSrc, rowPitch, srcImage.format ) )
+ return E_FAIL;
+
+ // Setup 'row0'
+ if ( flags & CNMAP_MIRROR_V )
+ {
+ // Mirror first row
+        memcpy_s( row0, width * sizeof(XMVECTOR), row1, width * sizeof(XMVECTOR) );
+ }
+ else
+ {
+ // Read last row (Wrap V)
+ if ( !_LoadScanline( row0, width, pSrc + (rowPitch * (height-1)), rowPitch, srcImage.format ) )
+ return E_FAIL;
+ }
+
+ // Evaluate the initial rows
+ _EvaluateRow( row0, val0, width, flags );
+ _EvaluateRow( row1, val1, width, flags );
+
+ pSrc += rowPitch;
+
+ for( size_t y = 0; y < height; ++y )
+ {
+ // Load next scanline of source image
+ if ( y < (height-1) )
+ {
+ if ( !_LoadScanline( row2, width, pSrc, rowPitch, srcImage.format ) )
+ return E_FAIL;
+ }
+ else
+ {
+ if ( flags & CNMAP_MIRROR_V )
+ {
+ // Use last row of source image
+ if ( !_LoadScanline( row2, width, srcImage.pixels + (rowPitch * (height-1)), rowPitch, srcImage.format ) )
+ return E_FAIL;
+ }
+ else
+ {
+ // Use first row of source image (Wrap V)
+ if ( !_LoadScanline( row2, width, srcImage.pixels, rowPitch, srcImage.format ) )
+ return E_FAIL;
+ }
+ }
+
+ // Evaluate row
+ _EvaluateRow( row2, val2, width, flags );
+
+ // Generate target scanline
+ XMVECTOR *dptr = target;
+ for( size_t x = 0; x < width; ++x )
+ {
+ // Compute normal via central differencing
+ float totDelta = ( val0[x] - val0[x+2] ) + ( val1[x] - val1[x+2] ) + ( val2[x] - val2[x+2] );
+ float deltaZX = totDelta * amplitude / 6.f;
+
+ totDelta = ( val0[x] - val2[x] ) + ( val0[x+1] - val2[x+1] ) + ( val0[x+2] - val2[x+2] );
+ float deltaZY = totDelta * amplitude / 6.f;
+
+ XMVECTOR vx = XMVectorSetZ( g_XMNegIdentityR0, deltaZX ); // (-1.0f, 0.0f, deltaZX)
+ XMVECTOR vy = XMVectorSetZ( g_XMNegIdentityR1, deltaZY ); // (0.0f, -1.0f, deltaZY)
+
+ XMVECTOR normal = XMVector3Normalize( XMVector3Cross( vx, vy ) );
+
+ // Compute alpha (1.0 or an occlusion term)
+ float alpha = 1.f;
+
+ if ( flags & CNMAP_COMPUTE_OCCLUSION )
+ {
+ float delta = 0.f;
+ float c = val1[x+1];
+
+ float t = val0[x] - c; if ( t > 0.f ) delta += t;
+ t = val0[x+1] - c; if ( t > 0.f ) delta += t;
+ t = val0[x+2] - c; if ( t > 0.f ) delta += t;
+ t = val1[x] - c; if ( t > 0.f ) delta += t;
+ // Skip current pixel
+ t = val1[x+2] - c; if ( t > 0.f ) delta += t;
+ t = val2[x] - c; if ( t > 0.f ) delta += t;
+ t = val2[x+1] - c; if ( t > 0.f ) delta += t;
+ t = val2[x+2] - c; if ( t > 0.f ) delta += t;
+
+ // Average delta (divide by 8, scale by amplitude factor)
+ delta *= 0.125f * amplitude;
+ if ( delta > 0.f )
+ {
+ // If < 0, then no occlusion
+ float r = sqrtf( 1.f + delta*delta );
+ alpha = (r - delta) / r;
+ }
+ }
+
+ // Encode based on target format
+ if ( convFlags & CONVF_UNORM )
+ {
+ // 0.5f*normal + 0.5f -or- invert sign case: -0.5f*normal + 0.5f
+ XMVECTOR n1 = XMVectorMultiplyAdd( (flags & CNMAP_INVERT_SIGN) ? g_XMNegativeOneHalf : g_XMOneHalf, normal, g_XMOneHalf );
+ *dptr++ = XMVectorSetW( n1, alpha );
+ }
+ else if ( flags & CNMAP_INVERT_SIGN )
+ {
+ *dptr++ = XMVectorSetW( XMVectorNegate( normal ), alpha );
+ }
+ else
+ {
+ *dptr++ = XMVectorSetW( normal, alpha );
+ }
+ }
+
+ if ( !_StoreScanline( pDest, normalMap.rowPitch, format, target, width ) )
+ return E_FAIL;
+
+ // Cycle buffers
+ float* temp = val0;
+ val0 = val1;
+ val1 = val2;
+ val2 = temp;
+
+ pSrc += rowPitch;
+ pDest += normalMap.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Generates a normal map from a height-map
+//-------------------------------------------------------------------------------------
+HRESULT ComputeNormalMap( const Image& srcImage, DWORD flags, float amplitude,
+ DXGI_FORMAT format, ScratchImage& normalMap )
+{
+ if ( !srcImage.pixels || !IsValid(format) || IsCompressed( format ) || IsTypeless( format ) )
+ return E_INVALIDARG;
+
+ static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" );
+ switch( flags & 0xf )
+ {
+ case 0:
+ case CNMAP_CHANNEL_RED:
+ case CNMAP_CHANNEL_GREEN:
+ case CNMAP_CHANNEL_BLUE:
+ case CNMAP_CHANNEL_ALPHA:
+ case CNMAP_CHANNEL_LUMINANCE:
+ break;
+
+ default:
+ return E_INVALIDARG;
+ }
+
+ if ( IsCompressed( srcImage.format ) || IsTypeless( srcImage.format ) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ // Setup target image
+ normalMap.Release();
+
+ HRESULT hr = normalMap.Initialize2D( format, srcImage.width, srcImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = normalMap.GetImage( 0, 0, 0 );
+ if ( !img )
+ {
+ normalMap.Release();
+ return E_POINTER;
+ }
+
+ hr = _ComputeNMap( srcImage, flags, amplitude, format, *img );
+ if ( FAILED(hr) )
+ {
+ normalMap.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
+
+HRESULT ComputeNormalMap( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ DWORD flags, float amplitude, DXGI_FORMAT format, ScratchImage& normalMaps )
+{
+ if ( !srcImages || !nimages )
+ return E_INVALIDARG;
+
+ if ( !IsValid(format) || IsCompressed(format) || IsTypeless(format) )
+ return E_INVALIDARG;
+
+ static_assert( CNMAP_CHANNEL_RED == 0x1, "CNMAP_CHANNEL_ flag values don't match mask" );
+ switch( flags & 0xf )
+ {
+ case 0:
+ case CNMAP_CHANNEL_RED:
+ case CNMAP_CHANNEL_GREEN:
+ case CNMAP_CHANNEL_BLUE:
+ case CNMAP_CHANNEL_ALPHA:
+ case CNMAP_CHANNEL_LUMINANCE:
+ break;
+
+ default:
+ return E_INVALIDARG;
+ }
+
+ normalMaps.Release();
+
+ TexMetadata mdata2 = metadata;
+ mdata2.format = format;
+ HRESULT hr = normalMaps.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages != normalMaps.GetImageCount() )
+ {
+ normalMaps.Release();
+ return E_FAIL;
+ }
+
+ const Image* dest = normalMaps.GetImages();
+ if ( !dest )
+ {
+ normalMaps.Release();
+ return E_POINTER;
+ }
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ assert( dest[ index ].format == format );
+
+ const Image& src = srcImages[ index ];
+ if ( IsCompressed( src.format ) || IsTypeless( src.format ) )
+ {
+ normalMaps.Release();
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ if ( src.width != dest[ index ].width || src.height != dest[ index ].height )
+ {
+ normalMaps.Release();
+ return E_FAIL;
+ }
+
+ hr = _ComputeNMap( src, flags, amplitude, format, dest[ index ] );
+ if ( FAILED(hr) )
+ {
+ normalMaps.Release();
+ return hr;
+ }
+ }
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexP.h b/thirdparty/directxtex/DirectXTex/DirectXTexP.h
new file mode 100644
index 00000000..4659b9bb
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexP.h
@@ -0,0 +1,199 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexP.h
+//
+// DirectX Texture Library - Private header
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+
+#ifdef USE_XNAMATH
+#include <xnamath.h>
+#else
+#include <directxmath.h>
+#include <directxpackedvector.h>
+#endif
+
+#include <assert.h>
+
+#include <malloc.h>
+#include <memory>
+
+#include <vector>
+
+#include <stdlib.h>
+#include <search.h>
+
+#include <ole2.h>
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+#include <d2d1.h>
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4005)
+#include <wincodec.h>
+#pragma warning(pop)
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) && !defined(DXGI_1_2_FORMATS)
+#define DXGI_1_2_FORMATS
+#endif
+
+#include "DirectXTex.h"
+
+#include "scoped.h"
+
+struct IWICImagingFactory;
+
+#define TEX_FILTER_MASK 0xF00000
+
+namespace DirectX
+{
+ //---------------------------------------------------------------------------------
+ // WIC helper functions
+ DXGI_FORMAT _WICToDXGI( _In_ const GUID& guid );
+ bool _DXGIToWIC( _In_ DXGI_FORMAT format, _Out_ GUID& guid );
+
+ IWICImagingFactory* _GetWIC();
+
+ bool _IsWIC2();
+
+ inline WICBitmapDitherType _GetWICDither( _In_ DWORD flags )
+ {
+ static_assert( TEX_FILTER_DITHER == 0x10000, "TEX_FILTER_DITHER* flag values don't match mask" );
+
+ static_assert( TEX_FILTER_DITHER == WIC_FLAGS_DITHER, "TEX_FILTER_DITHER* should match WIC_FLAGS_DITHER*" );
+ static_assert( TEX_FILTER_DITHER_DIFFUSION == WIC_FLAGS_DITHER_DIFFUSION, "TEX_FILTER_DITHER* should match WIC_FLAGS_DITHER*" );
+
+ switch( flags & 0xF0000 )
+ {
+ case TEX_FILTER_DITHER:
+ return WICBitmapDitherTypeOrdered4x4;
+
+ case TEX_FILTER_DITHER_DIFFUSION:
+ return WICBitmapDitherTypeErrorDiffusion;
+
+ default:
+ return WICBitmapDitherTypeNone;
+ }
+ }
+
+ inline WICBitmapInterpolationMode _GetWICInterp( _In_ DWORD flags )
+ {
+ static_assert( TEX_FILTER_POINT == 0x100000, "TEX_FILTER_ flag values don't match TEX_FILTER_MASK" );
+
+ static_assert( TEX_FILTER_POINT == WIC_FLAGS_FILTER_POINT, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" );
+ static_assert( TEX_FILTER_LINEAR == WIC_FLAGS_FILTER_LINEAR, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" );
+ static_assert( TEX_FILTER_CUBIC == WIC_FLAGS_FILTER_CUBIC, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" );
+ static_assert( TEX_FILTER_FANT == WIC_FLAGS_FILTER_FANT, "TEX_FILTER_* flags should match WIC_FLAGS_FILTER_*" );
+
+ switch( flags & TEX_FILTER_MASK )
+ {
+ case TEX_FILTER_POINT:
+ return WICBitmapInterpolationModeNearestNeighbor;
+
+ case TEX_FILTER_LINEAR:
+ return WICBitmapInterpolationModeLinear;
+
+ case TEX_FILTER_CUBIC:
+ return WICBitmapInterpolationModeCubic;
+
+ case TEX_FILTER_FANT:
+ default:
+ return WICBitmapInterpolationModeFant;
+ }
+ }
+
+ //---------------------------------------------------------------------------------
+ // Image helper functions
+ void _DetermineImageArray( _In_ const TexMetadata& metadata, _In_ DWORD cpFlags,
+ _Out_ size_t& nImages, _Out_ size_t& pixelSize );
+
+ bool _SetupImageArray( _In_bytecount_(pixelSize) uint8_t *pMemory, _In_ size_t pixelSize,
+ _In_ const TexMetadata& metadata, _In_ DWORD cpFlags,
+ _Out_cap_(nImages) Image* images, _In_ size_t nImages );
+
+ //---------------------------------------------------------------------------------
+ // Conversion helper functions
+
+ enum TEXP_SCANLINE_FLAGS
+ {
+ TEXP_SCANLINE_NONE = 0,
+ TEXP_SCANLINE_SETALPHA = 0x1, // Set alpha channel to known opaque value
+ TEXP_SCANLINE_LEGACY = 0x2, // Enables specific legacy format conversion cases
+ };
+
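+    // Bit flags describing the traits of a DXGI format, as returned by _GetConvertFlags below:
+    // the low bits record the numeric type and layout, and the 0xF0000 bits record which of
+    // the R/G/B/A channels the format carries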
+ enum CONVERT_FLAGS
+ {
+ CONVF_FLOAT = 0x1,
+ CONVF_UNORM = 0x2,
+ CONVF_UINT = 0x4,
+ CONVF_SNORM = 0x8,
+ CONVF_SINT = 0x10,
+ CONVF_DEPTH = 0x20,
+ CONVF_STENCIL = 0x40,
+ CONVF_SHAREDEXP = 0x80,
+ CONVF_BGR = 0x100,
+ CONVF_X2 = 0x200,
+ CONVF_PACKED = 0x400,
+ CONVF_BC = 0x800,
+ CONVF_R = 0x10000,
+ CONVF_G = 0x20000,
+ CONVF_B = 0x40000,
+ CONVF_A = 0x80000,
+ CONVF_RGB_MASK = 0x70000,
+ CONVF_RGBA_MASK = 0xF0000,
+ };
+
+ DWORD _GetConvertFlags( _In_ DXGI_FORMAT format );
+
+ void _CopyScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize,
+ _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize,
+ _In_ DXGI_FORMAT format, _In_ DWORD flags );
+
+ void _SwizzleScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize,
+ _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize,
+ _In_ DXGI_FORMAT format, _In_ DWORD flags );
+
+ bool _ExpandScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize,
+ _In_ DXGI_FORMAT outFormat,
+ _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize,
+ _In_ DXGI_FORMAT inFormat, _In_ DWORD flags );
+
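+    // The helpers below use DXGI_FORMAT_R32G32B32A32_FLOAT (one XMVECTOR per pixel) as the
+    // common intermediate representation: _LoadScanline decodes a source scanline into
+    // XMVECTORs, _ConvertScanline performs any needed in-place adjustment between the source
+    // and destination formats, and _StoreScanline encodes the result into the target format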
+ bool _LoadScanline( _Out_cap_(count) XMVECTOR* pDestination, _In_ size_t count,
+ _In_bytecount_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format );
+
+ bool _StoreScanline( _Out_bytecap_(size) LPVOID pDestination, _In_ size_t size, _In_ DXGI_FORMAT format,
+ _In_count_(count) const XMVECTOR* pSource, _In_ size_t count );
+
+ HRESULT _ConvertToR32G32B32A32( _In_ const Image& srcImage, _Inout_ ScratchImage& image );
+
+ HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ const Image& destImage );
+ HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ DXGI_FORMAT format, _Inout_ ScratchImage& image );
+ HRESULT _ConvertFromR32G32B32A32( _In_count_(nimages) const Image* srcImages, _In_ size_t nimages, _In_ const TexMetadata& metadata,
+ _In_ DXGI_FORMAT format, _Out_ ScratchImage& result );
+
+ void _ConvertScanline( _Inout_count_(count) XMVECTOR* pBuffer, _In_ size_t count,
+ _In_ DXGI_FORMAT outFormat, _In_ DXGI_FORMAT inFormat, _In_ DWORD flags );
+
+ //---------------------------------------------------------------------------------
+ // DDS helper functions
+ HRESULT _EncodeDDSHeader( _In_ const TexMetadata& metadata, DWORD flags,
+ _Out_opt_cap_x_(maxsize) LPVOID pDestination, _In_ size_t maxsize, _Out_ size_t& required );
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp
new file mode 100644
index 00000000..9f70c929
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexResize.cpp
@@ -0,0 +1,358 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexResize.cpp
+//
+// DirectX Texture Library - Image resizing operations
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+namespace DirectX
+{
+
+extern HRESULT _ResizeSeparateColorAndAlpha( _In_ IWICImagingFactory* pWIC, _In_ IWICBitmap* original,
+ _In_ size_t newWidth, _In_ size_t newHeight, _In_ DWORD filter, _Inout_ const Image* img );
+
+//-------------------------------------------------------------------------------------
+// Do image resize using WIC
+//-------------------------------------------------------------------------------------
+static HRESULT _PerformResizeUsingWIC( _In_ const Image& srcImage, _In_ DWORD filter,
+ _In_ const WICPixelFormatGUID& pfGUID, _In_ const Image& destImage )
+{
+ if ( !srcImage.pixels || !destImage.pixels )
+ return E_POINTER;
+
+ assert( srcImage.format == destImage.format );
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICComponentInfo> componentInfo;
+ HRESULT hr = pWIC->CreateComponentInfo( pfGUID, &componentInfo );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICPixelFormatInfo2> pixelFormatInfo;
+ hr = componentInfo->QueryInterface( __uuidof(IWICPixelFormatInfo2), (void**)&pixelFormatInfo );
+ if ( FAILED(hr) )
+ return hr;
+
+ BOOL supportsTransparency = FALSE;
+ hr = pixelFormatInfo->SupportsTransparency( &supportsTransparency );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmap> source;
+ hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( srcImage.width ), static_cast<UINT>( srcImage.height ), pfGUID,
+ static_cast<UINT>( srcImage.rowPitch ), static_cast<UINT>( srcImage.slicePitch ),
+ srcImage.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( (filter & TEX_FILTER_SEPARATE_ALPHA) && supportsTransparency )
+ {
+ hr = _ResizeSeparateColorAndAlpha( pWIC, source.Get(), destImage.width, destImage.height, filter, &destImage );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ ScopedObject<IWICBitmapScaler> scaler;
+ hr = pWIC->CreateBitmapScaler( &scaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->Initialize( source.Get(), static_cast<UINT>( destImage.width ), static_cast<UINT>( destImage.height ), _GetWICInterp( filter ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID pfScaler;
+ hr = scaler->GetPixelFormat( &pfScaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &pfScaler, &pfGUID, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+ hr = scaler->CopyPixels( 0, static_cast<UINT>( destImage.rowPitch ), static_cast<UINT>( destImage.slicePitch ), destImage.pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ // The WIC bitmap scaler is free to return a different pixel format than the source image, so here we
+ // convert it back
+ ScopedObject<IWICFormatConverter> FC;
+ hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( scaler.Get(), pfGUID, _GetWICDither( filter ), 0, 0, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( destImage.rowPitch ), static_cast<UINT>( destImage.slicePitch ), destImage.pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Do conversion, resize using WIC, conversion cycle
+//-------------------------------------------------------------------------------------
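+// Used when the source format has no direct WIC pixel format equivalent: convert the image
+// to DXGI_FORMAT_R32G32B32A32_FLOAT, resize that via WIC, then convert back to the original
+// format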
+static HRESULT _PerformResizeViaF32( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage )
+{
+ if ( !srcImage.pixels || !destImage.pixels )
+ return E_POINTER;
+
+ assert( srcImage.format != DXGI_FORMAT_R32G32B32A32_FLOAT );
+ assert( srcImage.format == destImage.format );
+
+ ScratchImage temp;
+ HRESULT hr = _ConvertToR32G32B32A32( srcImage, temp );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *tsrc = temp.GetImage( 0, 0, 0 );
+ if ( !tsrc )
+ return E_POINTER;
+
+ ScratchImage rtemp;
+ hr = rtemp.Initialize2D( DXGI_FORMAT_R32G32B32A32_FLOAT, destImage.width, destImage.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *tdest = rtemp.GetImage( 0, 0, 0 );
+ if ( !tdest )
+ return E_POINTER;
+
+ hr = _PerformResizeUsingWIC( *tsrc, filter, GUID_WICPixelFormat128bppRGBAFloat, *tdest );
+ if ( FAILED(hr) )
+ return hr;
+
+ temp.Release();
+
+ hr = _ConvertFromR32G32B32A32( *tdest, destImage );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Resize image
+//-------------------------------------------------------------------------------------
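+// Usage sketch (illustrative, not part of the library; assumes `loaded` is a ScratchImage
+// holding a single uncompressed image):
+//
+//   ScratchImage resized;
+//   HRESULT hr = Resize( *loaded.GetImage(0,0,0), 256, 256, TEX_FILTER_FANT, resized );
+//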
+HRESULT Resize( const Image& srcImage, size_t width, size_t height, DWORD filter, ScratchImage& image )
+{
+ if ( width == 0 || height == 0 )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( (srcImage.width > 0xFFFFFFFF) || (srcImage.height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+
+ if ( (width > 0xFFFFFFFF) || (height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ if ( !srcImage.pixels )
+ return E_POINTER;
+
+ if ( IsCompressed( srcImage.format ) )
+ {
+ // We don't support resizing compressed images
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ HRESULT hr = image.Initialize2D( srcImage.format, width, height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *rimage = image.GetImage( 0, 0, 0 );
+ if ( !rimage )
+ return E_POINTER;
+
+    // Note that WIC scalers only support CLAMP-style edge addressing (there is no wrap/mirror option)
+
+ WICPixelFormatGUID pfGUID;
+ if ( _DXGIToWIC( srcImage.format, pfGUID ) )
+ {
+ // Case 1: Source format is supported by Windows Imaging Component
+ hr = _PerformResizeUsingWIC( srcImage, filter, pfGUID, *rimage );
+ }
+ else
+ {
+ // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back
+ hr = _PerformResizeViaF32( srcImage, filter, *rimage );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Resize image (complex)
+//-------------------------------------------------------------------------------------
+HRESULT Resize( const Image* srcImages, size_t nimages, const TexMetadata& metadata,
+ size_t width, size_t height, DWORD filter, ScratchImage& result )
+{
+ if ( !srcImages || !nimages || width == 0 || height == 0 )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( (width > 0xFFFFFFFF) || (height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ TexMetadata mdata2 = metadata;
+ mdata2.width = width;
+ mdata2.height = height;
+ mdata2.mipLevels = 1;
+ HRESULT hr = result.Initialize( mdata2 );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID pfGUID;
+ bool wicpf = _DXGIToWIC( metadata.format, pfGUID );
+
+ switch ( metadata.dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ assert( metadata.depth == 1 );
+
+ for( size_t item = 0; item < metadata.arraySize; ++item )
+ {
+ size_t srcIndex = metadata.ComputeIndex( 0, item, 0 );
+ if ( srcIndex >= nimages )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const Image* srcimg = &srcImages[ srcIndex ];
+ const Image* destimg = result.GetImage( 0, item, 0 );
+ if ( !srcimg || !destimg )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ if ( srcimg->format != metadata.format )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+#ifdef _AMD64_
+ if ( (srcimg->width > 0xFFFFFFFF) || (srcimg->height > 0xFFFFFFFF) )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+#endif
+
+ if ( wicpf )
+ {
+ // Case 1: Source format is supported by Windows Imaging Component
+ hr = _PerformResizeUsingWIC( *srcimg, filter, pfGUID, *destimg );
+ }
+ else
+ {
+ // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back
+ hr = _PerformResizeViaF32( *srcimg, filter, *destimg );
+ }
+
+ if ( FAILED(hr) )
+ {
+ result.Release();
+ return hr;
+ }
+ }
+ break;
+
+ case TEX_DIMENSION_TEXTURE3D:
+ assert( metadata.arraySize == 1 );
+
+ for( size_t slice = 0; slice < metadata.depth; ++slice )
+ {
+ size_t srcIndex = metadata.ComputeIndex( 0, 0, slice );
+ if ( srcIndex >= nimages )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+ const Image* srcimg = &srcImages[ srcIndex ];
+ const Image* destimg = result.GetImage( 0, 0, slice );
+ if ( !srcimg || !destimg )
+ {
+ result.Release();
+ return E_POINTER;
+ }
+
+ if ( srcimg->format != metadata.format )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+
+#ifdef _AMD64_
+ if ( (srcimg->width > 0xFFFFFFFF) || (srcimg->height > 0xFFFFFFFF) )
+ {
+ result.Release();
+ return E_FAIL;
+ }
+#endif
+
+ if ( wicpf )
+ {
+ // Case 1: Source format is supported by Windows Imaging Component
+ hr = _PerformResizeUsingWIC( *srcimg, filter, pfGUID, *destimg );
+ }
+ else
+ {
+ // Case 2: Source format is not supported by WIC, so we have to convert, resize, and convert back
+ hr = _PerformResizeViaF32( *srcimg, filter, *destimg );
+ }
+
+ if ( FAILED(hr) )
+ {
+ result.Release();
+ return hr;
+ }
+ }
+ break;
+
+ default:
+ result.Release();
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp
new file mode 100644
index 00000000..e71575a5
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexTGA.cpp
@@ -0,0 +1,1387 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexTGA.cpp
+//
+// DirectX Texture Library - Targa Truevision (TGA) file format reader/writer
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+//
+// The implementation here has the following limitations:
+// * Does not support files that contain color maps (these are rare in practice)
+// * Interleaved files are not supported (deprecated aspect of TGA format)
+// * Only supports 8-bit greyscale; 16-, 24-, and 32-bit truecolor images
+// * Always writes uncompressed files (i.e. can read RLE compression, but does not write it)
+//
+
+enum TGAImageType
+{
+ TGA_NO_IMAGE = 0,
+ TGA_COLOR_MAPPED = 1,
+ TGA_TRUECOLOR = 2,
+ TGA_BLACK_AND_WHITE = 3,
+ TGA_COLOR_MAPPED_RLE = 9,
+ TGA_TRUECOLOR_RLE = 10,
+ TGA_BLACK_AND_WHITE_RLE = 11,
+};
+
+enum TGADescriptorFlags
+{
+ TGA_FLAGS_INVERTX = 0x10,
+ TGA_FLAGS_INVERTY = 0x20,
+ TGA_FLAGS_INTERLEAVED_2WAY = 0x40, // Deprecated
+ TGA_FLAGS_INTERLEAVED_4WAY = 0x80, // Deprecated
+};
+
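+// Signature found at the end of TGA 2.0 files ("TRUEVISION-XFILE." plus a terminating NUL
+// fills the footer's 18-byte Signature field); TGA 1.0 files have no footer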
+const char* g_TGA20_Signature = "TRUEVISION-XFILE.";
+
+#pragma pack(push,1)
+struct TGA_HEADER
+{
+ uint8_t bIDLength;
+ uint8_t bColorMapType;
+ uint8_t bImageType;
+ uint16_t wColorMapFirst;
+ uint16_t wColorMapLength;
+ uint8_t bColorMapSize;
+ uint16_t wXOrigin;
+ uint16_t wYOrigin;
+ uint16_t wWidth;
+ uint16_t wHeight;
+ uint8_t bBitsPerPixel;
+ uint8_t bDescriptor;
+};
+
+struct TGA_FOOTER
+{
+    uint32_t    dwExtensionOffset;
+    uint32_t    dwDeveloperOffset;
+ char Signature[18];
+};
+
+struct TGA_EXTENSION
+{
+ uint16_t wSize;
+ char szAuthorName[41];
+ char szAuthorComment[324];
+ uint16_t wStampMonth;
+ uint16_t wStampDay;
+ uint16_t wStampYear;
+ uint16_t wStampHour;
+ uint16_t wStampMinute;
+ uint16_t wStampSecond;
+ char szJobName[41];
+ uint16_t wJobHour;
+ uint16_t wJobMinute;
+ uint16_t wJobSecond;
+ char szSoftwareId[41];
+ uint16_t wVersionNumber;
+ uint8_t bVersionLetter;
+ uint32_t dwKeyColor;
+ uint16_t wPixelNumerator;
+ uint16_t wPixelDenominator;
+ uint16_t wGammaNumerator;
+ uint16_t wGammaDenominator;
+ uint32_t dwColorOffset;
+ uint32_t dwStampOffset;
+ uint32_t dwScanOffset;
+ uint8_t bAttributesType;
+};
+#pragma pack(pop)
+
+enum CONVERSION_FLAGS
+{
+ CONV_FLAGS_NONE = 0x0,
+ CONV_FLAGS_EXPAND = 0x1, // Conversion requires expanded pixel size
+ CONV_FLAGS_INVERTX = 0x2, // If set, scanlines are right-to-left
+ CONV_FLAGS_INVERTY = 0x4, // If set, scanlines are top-to-bottom
+ CONV_FLAGS_RLE = 0x8, // Source data is RLE compressed
+
+ CONV_FLAGS_SWIZZLE = 0x10000, // Swizzle BGR<->RGB data
+ CONV_FLAGS_888 = 0x20000, // 24bpp format
+};
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Decodes TGA header
+//-------------------------------------------------------------------------------------
+static HRESULT _DecodeTGAHeader( _In_bytecount_(size) LPCVOID pSource, size_t size, _Out_ TexMetadata& metadata, size_t& offset,
+ _Inout_opt_ DWORD* convFlags )
+{
+ if ( !pSource )
+ return E_INVALIDARG;
+
+ memset( &metadata, 0, sizeof(TexMetadata) );
+
+ if ( size < sizeof(TGA_HEADER) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ const TGA_HEADER* pHeader = reinterpret_cast<const TGA_HEADER*>( pSource );
+ assert( pHeader );
+
+ if ( pHeader->bColorMapType != 0
+ || pHeader->wColorMapLength != 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ if ( pHeader->bDescriptor & (TGA_FLAGS_INTERLEAVED_2WAY|TGA_FLAGS_INTERLEAVED_4WAY) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ if ( !pHeader->wWidth || !pHeader->wHeight )
+ {
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ switch ( pHeader->bImageType )
+ {
+ case TGA_TRUECOLOR:
+ case TGA_TRUECOLOR_RLE:
+ switch( pHeader->bBitsPerPixel )
+ {
+ case 16:
+ metadata.format = DXGI_FORMAT_B5G5R5A1_UNORM;
+ break;
+
+ case 24:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ if ( convFlags )
+ *convFlags |= CONV_FLAGS_EXPAND;
+ // We could use DXGI_FORMAT_B8G8R8X8_UNORM, but we prefer DXGI 1.0 formats
+ break;
+
+ case 32:
+ metadata.format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ // We could use DXGI_FORMAT_B8G8R8A8_UNORM, but we prefer DXGI 1.0 formats
+ break;
+
+        default:
+            return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+        }
+
+ if ( convFlags && (pHeader->bImageType == TGA_TRUECOLOR_RLE) )
+ {
+ *convFlags |= CONV_FLAGS_RLE;
+ }
+ break;
+
+ case TGA_BLACK_AND_WHITE:
+ case TGA_BLACK_AND_WHITE_RLE:
+ switch( pHeader->bBitsPerPixel )
+ {
+ case 8:
+ metadata.format = DXGI_FORMAT_R8_UNORM;
+ break;
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ if ( convFlags && (pHeader->bImageType == TGA_BLACK_AND_WHITE_RLE) )
+ {
+ *convFlags |= CONV_FLAGS_RLE;
+ }
+ break;
+
+ case TGA_NO_IMAGE:
+ case TGA_COLOR_MAPPED:
+ case TGA_COLOR_MAPPED_RLE:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_INVALID_DATA );
+ }
+
+ metadata.width = pHeader->wWidth;
+ metadata.height = pHeader->wHeight;
+ metadata.depth = metadata.arraySize = metadata.mipLevels = 1;
+ metadata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ if ( convFlags )
+ {
+ if ( pHeader->bDescriptor & TGA_FLAGS_INVERTX )
+ *convFlags |= CONV_FLAGS_INVERTX;
+
+ if ( pHeader->bDescriptor & TGA_FLAGS_INVERTY )
+ *convFlags |= CONV_FLAGS_INVERTY;
+ }
+
+ offset = sizeof( TGA_HEADER );
+
+ if ( pHeader->bIDLength != 0 )
+ {
+ offset += pHeader->bIDLength;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Set alpha for images with all 0 alpha channel
+//-------------------------------------------------------------------------------------
+static HRESULT _SetAlphaChannelToOpaque( _In_ const Image* image )
+{
+ assert( image );
+
+ uint8_t* pPixels = reinterpret_cast<uint8_t*>( image->pixels );
+ if ( !pPixels )
+ return E_POINTER;
+
+ for( size_t y = 0; y < image->height; ++y )
+ {
+ _CopyScanline( pPixels, image->rowPitch, pPixels, image->rowPitch, image->format, TEXP_SCANLINE_SETALPHA );
+ pPixels += image->rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Uncompress pixel data from a TGA into the target image
+//-------------------------------------------------------------------------------------
+static HRESULT _UncompressPixels( _In_bytecount_(size) LPCVOID pSource, size_t size, _In_ const Image* image, DWORD convFlags )
+{
+ assert( pSource && size > 0 );
+
+ if ( !image || !image->pixels )
+ return E_POINTER;
+
+ // Compute TGA image data pitch
+ size_t rowPitch;
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ rowPitch = image->width * 3;
+ }
+ else
+ {
+ size_t slicePitch;
+ ComputePitch( image->format, image->width, image->height, rowPitch, slicePitch, CP_FLAGS_NONE );
+ }
+
+ const uint8_t* sPtr = reinterpret_cast<const uint8_t*>( pSource );
+ const uint8_t* endPtr = sPtr + size;
+
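+    // TGA RLE packets: a one-byte header whose low 7 bits (+1) give a pixel count. If the
+    // high bit is set, a single pixel value follows and is repeated that many times;
+    // otherwise that many literal pixel values follow.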
+ switch( image->format )
+ {
+ //--------------------------------------------------------------------------- 8-bit
+ case DXGI_FORMAT_R8_UNORM:
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+ assert( offset < rowPitch);
+
+ uint8_t* dPtr = reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; )
+ {
+ if ( sPtr >= endPtr )
+ return E_FAIL;
+
+ if ( *sPtr & 0x80 )
+ {
+ // Repeat
+ size_t j = (*sPtr & 0x7F) + 1;
+ if ( ++sPtr >= endPtr )
+ return E_FAIL;
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ *dPtr = *sPtr;
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+
+ ++sPtr;
+ }
+ else
+ {
+ // Literal
+ size_t j = (*sPtr & 0x7F) + 1;
+ ++sPtr;
+
+ if ( sPtr+j > endPtr )
+ return E_FAIL;
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ *dPtr = *(sPtr++);
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ }
+ }
+ break;
+
+ //-------------------------------------------------------------------------- 16-bit
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ {
+ bool nonzeroa = false;
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+ assert( offset*2 < rowPitch);
+
+ uint16_t* dPtr = reinterpret_cast<uint16_t*>( reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; )
+ {
+ if ( sPtr >= endPtr )
+ return E_FAIL;
+
+ if ( *sPtr & 0x80 )
+ {
+ // Repeat
+ size_t j = (*sPtr & 0x7F) + 1;
+ ++sPtr;
+
+ if ( sPtr+1 >= endPtr )
+ return E_FAIL;
+
+ uint16_t t = *sPtr | (*(sPtr+1) << 8);
+ if ( t & 0x8000 )
+ nonzeroa = true;
+ sPtr += 2;
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ *dPtr = t;
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ else
+ {
+ // Literal
+ size_t j = (*sPtr & 0x7F) + 1;
+ ++sPtr;
+
+ if ( sPtr+(j*2) > endPtr )
+ return E_FAIL;
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ uint16_t t = *sPtr | (*(sPtr+1) << 8);
+ if ( t & 0x8000 )
+ nonzeroa = true;
+ sPtr += 2;
+ *dPtr = t;
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ }
+ }
+
+ // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque
+ if ( !nonzeroa )
+ {
+ HRESULT hr = _SetAlphaChannelToOpaque( image );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ break;
+
+ //----------------------------------------------------------------------- 24/32-bit
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ {
+ bool nonzeroa = false;
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+
+ uint32_t* dPtr = reinterpret_cast<uint32_t*>( reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; )
+ {
+ if ( sPtr >= endPtr )
+ return E_FAIL;
+
+ if ( *sPtr & 0x80 )
+ {
+ // Repeat
+ size_t j = (*sPtr & 0x7F) + 1;
+ ++sPtr;
+
+ DWORD t;
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ assert( offset*3 < rowPitch);
+
+ if ( sPtr+2 >= endPtr )
+ return E_FAIL;
+
+ // BGR -> RGBA
+ t = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000;
+ sPtr += 3;
+
+ nonzeroa = true;
+ }
+ else
+ {
+ assert( offset*4 < rowPitch);
+
+ if ( sPtr+3 >= endPtr )
+ return E_FAIL;
+
+ // BGRA -> RGBA
+ t = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 );
+
+ if ( *(sPtr+3) > 0 )
+ nonzeroa = true;
+
+ sPtr += 4;
+ }
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ *dPtr = t;
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ else
+ {
+ // Literal
+ size_t j = (*sPtr & 0x7F) + 1;
+ ++sPtr;
+
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ if ( sPtr+(j*3) > endPtr )
+ return E_FAIL;
+ }
+ else
+ {
+ if ( sPtr+(j*4) > endPtr )
+ return E_FAIL;
+ }
+
+ for( ; j > 0; --j, ++x )
+ {
+ if ( x >= image->width )
+ return E_FAIL;
+
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ assert( offset*3 < rowPitch);
+
+ if ( sPtr+2 >= endPtr )
+ return E_FAIL;
+
+ // BGR -> RGBA
+ *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000;
+ sPtr += 3;
+
+ nonzeroa = true;
+ }
+ else
+ {
+ assert( offset*4 < rowPitch);
+
+ if ( sPtr+3 >= endPtr )
+ return E_FAIL;
+
+ // BGRA -> RGBA
+ *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 );
+
+ if ( *(sPtr+3) > 0 )
+ nonzeroa = true;
+
+ sPtr += 4;
+ }
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ }
+ }
+
+ // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque
+ if ( !nonzeroa )
+ {
+ HRESULT hr = _SetAlphaChannelToOpaque( image );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ break;
+
+ //---------------------------------------------------------------------------------
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Copies pixel data from a TGA into the target image
+//-------------------------------------------------------------------------------------
+static HRESULT _CopyPixels( _In_bytecount_(size) LPCVOID pSource, size_t size, _In_ const Image* image, DWORD convFlags )
+{
+ assert( pSource && size > 0 );
+
+ if ( !image || !image->pixels )
+ return E_POINTER;
+
+ // Compute TGA image data pitch
+ size_t rowPitch;
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ rowPitch = image->width * 3;
+ }
+ else
+ {
+ size_t slicePitch;
+ ComputePitch( image->format, image->width, image->height, rowPitch, slicePitch, CP_FLAGS_NONE );
+ }
+
+ const uint8_t* sPtr = reinterpret_cast<const uint8_t*>( pSource );
+ const uint8_t* endPtr = sPtr + size;
+
+ switch( image->format )
+ {
+ //--------------------------------------------------------------------------- 8-bit
+ case DXGI_FORMAT_R8_UNORM:
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+ assert( offset < rowPitch);
+
+ uint8_t* dPtr = reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; ++x )
+ {
+ if ( sPtr >= endPtr )
+ return E_FAIL;
+
+ *dPtr = *(sPtr++);
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+ break;
+
+ //-------------------------------------------------------------------------- 16-bit
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ {
+ bool nonzeroa = false;
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+ assert( offset*2 < rowPitch);
+
+ uint16_t* dPtr = reinterpret_cast<uint16_t*>( reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; ++x )
+ {
+ if ( sPtr+1 >= endPtr )
+ return E_FAIL;
+
+ uint16_t t = *sPtr | (*(sPtr+1) << 8);
+ sPtr += 2;
+ *dPtr = t;
+
+ if ( t & 0x8000 )
+ nonzeroa = true;
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+
+ // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque
+ if ( !nonzeroa )
+ {
+ HRESULT hr = _SetAlphaChannelToOpaque( image );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ break;
+
+ //----------------------------------------------------------------------- 24/32-bit
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ {
+ bool nonzeroa = false;
+ for( size_t y=0; y < image->height; ++y )
+ {
+ size_t offset = ( (convFlags & CONV_FLAGS_INVERTX ) ? (image->width - 1) : 0 );
+
+ uint32_t* dPtr = reinterpret_cast<uint32_t*>( reinterpret_cast<uint8_t*>( image->pixels )
+ + ( image->rowPitch * ( (convFlags & CONV_FLAGS_INVERTY) ? y : (image->height - y - 1) ) ) )
+ + offset;
+
+ for( size_t x=0; x < image->width; ++x )
+ {
+ if ( convFlags & CONV_FLAGS_EXPAND )
+ {
+ assert( offset*3 < rowPitch);
+
+ if ( sPtr+2 >= endPtr )
+ return E_FAIL;
+
+ // BGR -> RGBA
+ *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | 0xFF000000;
+ sPtr += 3;
+
+ nonzeroa = true;
+ }
+ else
+ {
+ assert( offset*4 < rowPitch);
+
+ if ( sPtr+3 >= endPtr )
+ return E_FAIL;
+
+ // BGRA -> RGBA
+ *dPtr = ( *sPtr << 16 ) | ( *(sPtr+1) << 8 ) | ( *(sPtr+2) ) | ( *(sPtr+3) << 24 );
+
+ if ( *(sPtr+3) > 0 )
+ nonzeroa = true;
+
+ sPtr += 4;
+ }
+
+ if ( convFlags & CONV_FLAGS_INVERTX )
+ --dPtr;
+ else
+ ++dPtr;
+ }
+ }
+
+ // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque
+ if ( !nonzeroa )
+ {
+ HRESULT hr = _SetAlphaChannelToOpaque( image );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ break;
+
+ //---------------------------------------------------------------------------------
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Encodes TGA file header
+//-------------------------------------------------------------------------------------
+static HRESULT _EncodeTGAHeader( _In_ const Image& image, _Out_ TGA_HEADER& header, DWORD& convFlags )
+{
+ assert( IsValid( image.format ) && !IsVideo( image.format ) );
+
+ memset( &header, 0, sizeof(TGA_HEADER) );
+
+ if ( (image.width > 0xFFFF)
+ || (image.height > 0xFFFF) )
+ {
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ header.wWidth = static_cast<uint16_t>( image.width );
+ header.wHeight = static_cast<uint16_t>( image.height );
+
+ switch( image.format )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ header.bImageType = TGA_TRUECOLOR;
+ header.bBitsPerPixel = 32;
+ header.bDescriptor = TGA_FLAGS_INVERTY | 8;
+ convFlags |= CONV_FLAGS_SWIZZLE;
+ break;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ header.bImageType = TGA_TRUECOLOR;
+ header.bBitsPerPixel = 32;
+ header.bDescriptor = TGA_FLAGS_INVERTY | 8;
+ break;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ header.bImageType = TGA_TRUECOLOR;
+ header.bBitsPerPixel = 24;
+ header.bDescriptor = TGA_FLAGS_INVERTY;
+ convFlags |= CONV_FLAGS_888;
+ break;
+
+ case DXGI_FORMAT_R8_UNORM:
+ case DXGI_FORMAT_A8_UNORM:
+ header.bImageType = TGA_BLACK_AND_WHITE;
+ header.bBitsPerPixel = 8;
+ header.bDescriptor = TGA_FLAGS_INVERTY;
+ break;
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ header.bImageType = TGA_TRUECOLOR;
+ header.bBitsPerPixel = 16;
+ header.bDescriptor = TGA_FLAGS_INVERTY | 1;
+ break;
+
+ default:
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Copies BGRX data to form BGR 24bpp data
+//-------------------------------------------------------------------------------------
+#pragma warning(suppress: 6001 6101) // In the case where outSize is insufficient we do not write to pDestination
+static void _Copy24bppScanline( _Out_bytecap_(outSize) LPVOID pDestination, _In_ size_t outSize,
+ _In_bytecount_(inSize) LPCVOID pSource, _In_ size_t inSize )
+{
+ assert( pDestination && outSize > 0 );
+ assert( pSource && inSize > 0 );
+
+ assert( pDestination != pSource );
+
+ const uint32_t * __restrict sPtr = reinterpret_cast<const uint32_t*>(pSource);
+ uint8_t * __restrict dPtr = reinterpret_cast<uint8_t*>(pDestination);
+
+ const uint8_t* endPtr = dPtr + outSize;
+
+    for( size_t count = 0; ( count + 3 ) < inSize; count += 4 )
+    {
+        uint32_t t = *(sPtr++);
+
+        if ( dPtr+2 >= endPtr )
+ return;
+
+ *(dPtr++) = uint8_t(t & 0xFF); // Blue
+ *(dPtr++) = uint8_t((t & 0xFF00) >> 8); // Green
+ *(dPtr++) = uint8_t((t & 0xFF0000) >> 16); // Red
+ }
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Obtain metadata from TGA file in memory/on disk
+//-------------------------------------------------------------------------------------
+HRESULT GetMetadataFromTGAMemory( LPCVOID pSource, size_t size, TexMetadata& metadata )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+ size_t offset;
+ return _DecodeTGAHeader( pSource, size, metadata, offset, 0 );
+}
+
+HRESULT GetMetadataFromTGAFile( LPCWSTR szFile, TexMetadata& metadata )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING,
+ FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) );
+#endif
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ // Get the file size
+ LARGE_INTEGER fileSize = {0};
+
+#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA)
+ FILE_STANDARD_INFO fileInfo;
+ if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+ fileSize = fileInfo.EndOfFile;
+#else
+ if ( !GetFileSizeEx( hFile.get(), &fileSize ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+#endif
+
+ // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid TGA file)
+ if ( fileSize.HighPart > 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+ }
+
+ // Need at least enough data to fill the standard header to be a valid TGA
+ if ( fileSize.LowPart < ( sizeof(TGA_HEADER) ) )
+ {
+ return E_FAIL;
+ }
+
+ // Read the standard header (we don't need the file footer to parse the file)
+ uint8_t header[sizeof(TGA_HEADER)];
+ DWORD bytesRead = 0;
+ if ( !ReadFile( hFile.get(), header, sizeof(TGA_HEADER), &bytesRead, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ size_t offset;
+ return _DecodeTGAHeader( header, bytesRead, metadata, offset, 0 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a TGA file in memory
+//-------------------------------------------------------------------------------------
+HRESULT LoadFromTGAMemory( LPCVOID pSource, size_t size, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+ image.Release();
+
+ size_t offset;
+ DWORD convFlags = 0;
+ TexMetadata mdata;
+ HRESULT hr = _DecodeTGAHeader( pSource, size, mdata, offset, &convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( offset > size )
+ return E_FAIL;
+
+ LPCVOID pPixels = reinterpret_cast<LPCVOID>( reinterpret_cast<const uint8_t*>(pSource) + offset );
+ assert( pPixels );
+
+ size_t remaining = size - offset;
+ if ( remaining == 0 )
+ return E_FAIL;
+
+ hr = image.Initialize2D( mdata.format, mdata.width, mdata.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( convFlags & CONV_FLAGS_RLE )
+ {
+ hr = _UncompressPixels( pPixels, remaining, image.GetImage(0,0,0), convFlags );
+ }
+ else
+ {
+ hr = _CopyPixels( pPixels, remaining, image.GetImage(0,0,0), convFlags );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a TGA file from disk
+//-------------------------------------------------------------------------------------
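+// Usage sketch (illustrative, not part of the library):
+//
+//   TexMetadata info;
+//   ScratchImage img;
+//   HRESULT hr = LoadFromTGAFile( L"input.tga", &info, img );
+//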
+HRESULT LoadFromTGAFile( LPCWSTR szFile, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ image.Release();
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING,
+ FILE_FLAG_SEQUENTIAL_SCAN, 0 ) ) );
+#endif
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ // Get the file size
+ LARGE_INTEGER fileSize = {0};
+
+#if (_WIN32_WINNT >= _WIN32_WINNT_VISTA)
+ FILE_STANDARD_INFO fileInfo;
+ if ( !GetFileInformationByHandleEx( hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo) ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+ fileSize = fileInfo.EndOfFile;
+#else
+ if ( !GetFileSizeEx( hFile.get(), &fileSize ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+#endif
+
+ // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid TGA file)
+ if ( fileSize.HighPart > 0 )
+ {
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+ }
+
+ // Need at least enough data to fill the header to be a valid TGA
+ if ( fileSize.LowPart < sizeof(TGA_HEADER) )
+ {
+ return E_FAIL;
+ }
+
+ // Read the header
+ uint8_t header[sizeof(TGA_HEADER)];
+ DWORD bytesRead = 0;
+ if ( !ReadFile( hFile.get(), header, sizeof(TGA_HEADER), &bytesRead, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ size_t offset;
+ DWORD convFlags = 0;
+ TexMetadata mdata;
+ HRESULT hr = _DecodeTGAHeader( header, bytesRead, mdata, offset, &convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Read the pixels
+ DWORD remaining = static_cast<DWORD>( fileSize.LowPart - offset );
+ if ( remaining == 0 )
+ return E_FAIL;
+
+ if ( offset > sizeof(TGA_HEADER) )
+ {
+ // Skip past the id string
+ LARGE_INTEGER filePos = { static_cast<DWORD>(offset), 0 };
+ if ( !SetFilePointerEx( hFile.get(), filePos, 0, FILE_BEGIN ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+ }
+
+ hr = image.Initialize2D( mdata.format, mdata.width, mdata.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ assert( image.GetPixels() );
+
+ if ( !(convFlags & (CONV_FLAGS_RLE | CONV_FLAGS_EXPAND | CONV_FLAGS_INVERTX)) && (convFlags & CONV_FLAGS_INVERTY) )
+ {
+        // In this case we can read directly into the image buffer in place
+ if ( !ReadFile( hFile.get(), image.GetPixels(), static_cast<DWORD>( image.GetPixelsSize() ), &bytesRead, 0 ) )
+ {
+ image.Release();
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesRead != image.GetPixelsSize() )
+ {
+ image.Release();
+ return E_FAIL;
+ }
+
+ switch( mdata.format )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ {
+ // TGA stores 32-bit data in BGRA form, need to swizzle to RGBA
+ assert( image.GetImageCount() == 1 );
+ const Image* img = image.GetImage(0,0,0);
+ if ( !img )
+ return E_POINTER;
+
+ uint8_t *pPixels = img->pixels;
+ if ( !pPixels )
+ return E_POINTER;
+
+ size_t rowPitch = img->rowPitch;
+
+ // Scan for non-zero alpha channel
+ bool nonzeroa = false;
+
+ for( size_t h = 0; h < img->height; ++h )
+ {
+ const uint32_t* sPtr = reinterpret_cast<const uint32_t*>( pPixels );
+
+ for( size_t x=0; x < img->width; ++x )
+ {
+ if ( (*sPtr) & 0xff000000 )
+ {
+ nonzeroa = true;
+ break;
+ }
+
+ ++sPtr;
+ }
+
+ if ( nonzeroa )
+ break;
+
+ pPixels += rowPitch;
+ }
+
+ DWORD tflags = ( !nonzeroa ) ? TEXP_SCANLINE_SETALPHA : TEXP_SCANLINE_NONE;
+
+ // Swizzle scanlines
+ pPixels = img->pixels;
+
+ for( size_t h = 0; h < img->height; ++h )
+ {
+ _SwizzleScanline( pPixels, rowPitch, pPixels, rowPitch, mdata.format, tflags );
+
+ pPixels += rowPitch;
+ }
+ }
+ break;
+
+ // If we start using DXGI_FORMAT_B8G8R8X8_UNORM or DXGI_FORMAT_B8G8R8A8_UNORM we need to check for a fully 0 alpha channel
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ {
+ assert( image.GetImageCount() == 1 );
+ const Image* img = image.GetImage(0,0,0);
+ if ( !img )
+ return E_POINTER;
+
+ // Scan for non-zero alpha channel
+ bool nonzeroa = false;
+
+ const uint8_t *pPixels = img->pixels;
+ if ( !pPixels )
+ return E_POINTER;
+
+ size_t rowPitch = img->rowPitch;
+
+ for( size_t h = 0; h < img->height; ++h )
+ {
+ const uint16_t* sPtr = reinterpret_cast<const uint16_t*>( pPixels );
+
+ for( size_t x=0; x < img->width; ++x )
+ {
+ if ( *sPtr & 0x8000 )
+ {
+ nonzeroa = true;
+ break;
+ }
+
+ ++sPtr;
+ }
+
+ if ( nonzeroa )
+ break;
+
+ pPixels += rowPitch;
+ }
+
+ // If there are no non-zero alpha channel entries, we'll assume alpha is not used and force it to opaque
+ if ( !nonzeroa )
+ {
+ hr = _SetAlphaChannelToOpaque( img );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ break;
+ }
+ }
+ else // RLE || EXPAND || INVERTX || !INVERTY
+ {
+ std::unique_ptr<uint8_t[]> temp( new uint8_t[ remaining ] );
+ if ( !temp )
+ {
+ image.Release();
+ return E_OUTOFMEMORY;
+ }
+
+ if ( !ReadFile( hFile.get(), temp.get(), remaining, &bytesRead, 0 ) )
+ {
+ image.Release();
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesRead != remaining )
+ {
+ image.Release();
+ return E_FAIL;
+ }
+
+ if ( convFlags & CONV_FLAGS_RLE )
+ {
+ hr = _UncompressPixels( temp.get(), remaining, image.GetImage(0,0,0), convFlags );
+ }
+ else
+ {
+ hr = _CopyPixels( temp.get(), remaining, image.GetImage(0,0,0), convFlags );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+ }
+
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Save a TGA file to memory
+//-------------------------------------------------------------------------------------
+HRESULT SaveToTGAMemory( const Image& image, Blob& blob )
+{
+ if ( !image.pixels )
+ return E_POINTER;
+
+ TGA_HEADER tga_header;
+ DWORD convFlags = 0;
+ HRESULT hr = _EncodeTGAHeader( image, tga_header, convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ blob.Release();
+
+ // Determine memory required for image data
+ size_t rowPitch, slicePitch;
+ if ( convFlags & CONV_FLAGS_888 )
+ {
+ rowPitch = image.width * 3;
+ slicePitch = image.height * rowPitch;
+ }
+ else
+ {
+ ComputePitch( image.format, image.width, image.height, rowPitch, slicePitch, CP_FLAGS_NONE );
+ }
+
+ hr = blob.Initialize( sizeof(TGA_HEADER) + slicePitch );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Copy header
+ uint8_t* dPtr = reinterpret_cast<uint8_t*>( blob.GetBufferPointer() );
+ assert( dPtr != 0 );
+ memcpy_s( dPtr, blob.GetBufferSize(), &tga_header, sizeof(TGA_HEADER) );
+ dPtr += sizeof(TGA_HEADER);
+
+ const uint8_t* pPixels = reinterpret_cast<const uint8_t*>( image.pixels );
+ assert( pPixels );
+
+ for( size_t y = 0; y < image.height; ++y )
+ {
+ // Copy pixels
+ if ( convFlags & CONV_FLAGS_888 )
+ {
+ _Copy24bppScanline( dPtr, rowPitch, pPixels, image.rowPitch );
+ }
+ else if ( convFlags & CONV_FLAGS_SWIZZLE )
+ {
+ _SwizzleScanline( dPtr, rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE );
+ }
+ else
+ {
+ _CopyScanline( dPtr, rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE );
+ }
+
+ dPtr += rowPitch;
+ pPixels += image.rowPitch;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Save a TGA file to disk
+//-------------------------------------------------------------------------------------
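+// Usage sketch (illustrative, not part of the library; assumes `img` is a ScratchImage
+// holding a single image in one of the supported formats):
+//
+//   HRESULT hr = SaveToTGAFile( *img.GetImage(0,0,0), L"output.tga" );
+//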
+HRESULT SaveToTGAFile( const Image& image, LPCWSTR szFile )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ if ( !image.pixels )
+ return E_POINTER;
+
+ TGA_HEADER tga_header;
+ DWORD convFlags = 0;
+ HRESULT hr = _EncodeTGAHeader( image, tga_header, convFlags );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Create file and write header
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/)
+ ScopedHandle hFile( safe_handle( CreateFile2( szFile, GENERIC_WRITE, 0, CREATE_ALWAYS, 0 ) ) );
+#else
+ ScopedHandle hFile( safe_handle( CreateFileW( szFile, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0 ) ) );
+#endif
+ if ( !hFile )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ // Determine size for TGA pixel data
+ size_t rowPitch, slicePitch;
+ if ( convFlags & CONV_FLAGS_888 )
+ {
+ rowPitch = image.width * 3;
+ slicePitch = image.height * rowPitch;
+ }
+ else
+ {
+ ComputePitch( image.format, image.width, image.height, rowPitch, slicePitch, CP_FLAGS_NONE );
+ }
+
+ if ( slicePitch < 65535 )
+ {
+ // For small images, it is better to create an in-memory file and write it out
+ Blob blob;
+
+ hr = SaveToTGAMemory( image, blob );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Write blob
+ const DWORD bytesToWrite = static_cast<DWORD>( blob.GetBufferSize() );
+ DWORD bytesWritten;
+ if ( !WriteFile( hFile.get(), blob.GetBufferPointer(), bytesToWrite,
+ &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != bytesToWrite )
+ {
+ return E_FAIL;
+ }
+ }
+ else
+ {
+ // Otherwise, write the image one scanline at a time...
+ std::unique_ptr<uint8_t[]> temp( new uint8_t[ rowPitch ] );
+ if ( !temp )
+ return E_OUTOFMEMORY;
+
+ // Write header
+ DWORD bytesWritten;
+ if ( !WriteFile( hFile.get(), &tga_header, sizeof(TGA_HEADER), &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != sizeof(TGA_HEADER) )
+ return E_FAIL;
+
+ // Write pixels
+ const uint8_t* pPixels = reinterpret_cast<const uint8_t*>( image.pixels );
+
+ for( size_t y = 0; y < image.height; ++y )
+ {
+ // Copy pixels
+ if ( convFlags & CONV_FLAGS_888 )
+ {
+ _Copy24bppScanline( temp.get(), rowPitch, pPixels, image.rowPitch );
+ }
+ else if ( convFlags & CONV_FLAGS_SWIZZLE )
+ {
+ _SwizzleScanline( temp.get(), rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE );
+ }
+ else
+ {
+ _CopyScanline( temp.get(), rowPitch, pPixels, image.rowPitch, image.format, TEXP_SCANLINE_NONE );
+ }
+
+ pPixels += image.rowPitch;
+
+ if ( !WriteFile( hFile.get(), temp.get(), static_cast<DWORD>( rowPitch ), &bytesWritten, 0 ) )
+ {
+ return HRESULT_FROM_WIN32( GetLastError() );
+ }
+
+ if ( bytesWritten != rowPitch )
+ return E_FAIL;
+ }
+ }
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp
new file mode 100644
index 00000000..9d5e0905
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexUtil.cpp
@@ -0,0 +1,759 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexUtil.cpp
+//
+// DirectX Texture Library - Utilities
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+//-------------------------------------------------------------------------------------
+// WIC Pixel Format Translation Data
+//-------------------------------------------------------------------------------------
+struct WICTranslate
+{
+ GUID wic;
+ DXGI_FORMAT format;
+};
+
+static WICTranslate g_WICFormats[] =
+{
+ { GUID_WICPixelFormat128bppRGBAFloat, DXGI_FORMAT_R32G32B32A32_FLOAT },
+
+ { GUID_WICPixelFormat64bppRGBAHalf, DXGI_FORMAT_R16G16B16A16_FLOAT },
+ { GUID_WICPixelFormat64bppRGBA, DXGI_FORMAT_R16G16B16A16_UNORM },
+
+ { GUID_WICPixelFormat32bppRGBA, DXGI_FORMAT_R8G8B8A8_UNORM },
+ { GUID_WICPixelFormat32bppBGRA, DXGI_FORMAT_B8G8R8A8_UNORM }, // DXGI 1.1
+ { GUID_WICPixelFormat32bppBGR, DXGI_FORMAT_B8G8R8X8_UNORM }, // DXGI 1.1
+
+ { GUID_WICPixelFormat32bppRGBA1010102XR, DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM }, // DXGI 1.1
+ { GUID_WICPixelFormat32bppRGBA1010102, DXGI_FORMAT_R10G10B10A2_UNORM },
+ { GUID_WICPixelFormat32bppRGBE, DXGI_FORMAT_R9G9B9E5_SHAREDEXP },
+
+ { GUID_WICPixelFormat16bppBGRA5551, DXGI_FORMAT_B5G5R5A1_UNORM },
+ { GUID_WICPixelFormat16bppBGR565, DXGI_FORMAT_B5G6R5_UNORM },
+
+ { GUID_WICPixelFormat32bppGrayFloat, DXGI_FORMAT_R32_FLOAT },
+ { GUID_WICPixelFormat16bppGrayHalf, DXGI_FORMAT_R16_FLOAT },
+ { GUID_WICPixelFormat16bppGray, DXGI_FORMAT_R16_UNORM },
+ { GUID_WICPixelFormat8bppGray, DXGI_FORMAT_R8_UNORM },
+
+ { GUID_WICPixelFormat8bppAlpha, DXGI_FORMAT_A8_UNORM },
+
+ { GUID_WICPixelFormatBlackWhite, DXGI_FORMAT_R1_UNORM },
+};
+
+static bool g_WIC2 = false;
+
+namespace DirectX
+{
+
+//=====================================================================================
+// WIC Utilities
+//=====================================================================================
+
+DXGI_FORMAT _WICToDXGI( const GUID& guid )
+{
+ for( size_t i=0; i < _countof(g_WICFormats); ++i )
+ {
+ if ( memcmp( &g_WICFormats[i].wic, &guid, sizeof(GUID) ) == 0 )
+ return g_WICFormats[i].format;
+ }
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ if ( g_WIC2 )
+ {
+ if ( memcmp( &GUID_WICPixelFormat96bppRGBFloat, &guid, sizeof(GUID) ) == 0 )
+ return DXGI_FORMAT_R32G32B32_FLOAT;
+ }
+#endif
+
+ return DXGI_FORMAT_UNKNOWN;
+}
+
+bool _DXGIToWIC( DXGI_FORMAT format, GUID& guid )
+{
+ switch( format )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ memcpy( &guid, &GUID_WICPixelFormat32bppRGBA, sizeof(GUID) );
+ return true;
+
+ case DXGI_FORMAT_D32_FLOAT:
+ memcpy( &guid, &GUID_WICPixelFormat32bppGrayFloat, sizeof(GUID) );
+ return true;
+
+ case DXGI_FORMAT_D16_UNORM:
+ memcpy( &guid, &GUID_WICPixelFormat16bppGray, sizeof(GUID) );
+ return true;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ memcpy( &guid, &GUID_WICPixelFormat32bppBGRA, sizeof(GUID) );
+ return true;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ memcpy( &guid, &GUID_WICPixelFormat32bppBGR, sizeof(GUID) );
+ return true;
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ if ( g_WIC2 )
+ {
+ memcpy( &guid, &GUID_WICPixelFormat96bppRGBFloat, sizeof(GUID) );
+ return true;
+ }
+ break;
+#endif
+
+ default:
+ for( size_t i=0; i < _countof(g_WICFormats); ++i )
+ {
+ if ( g_WICFormats[i].format == format )
+ {
+ memcpy( &guid, &g_WICFormats[i].wic, sizeof(GUID) );
+ return true;
+ }
+ }
+ break;
+ }
+
+ memcpy( &guid, &GUID_NULL, sizeof(GUID) );
+ return false;
+}
+
+bool _IsWIC2()
+{
+ return g_WIC2;
+}
+
+IWICImagingFactory* _GetWIC()
+{
+ static IWICImagingFactory* s_Factory = nullptr;
+
+ if ( s_Factory )
+ return s_Factory;
+
+#if(_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ HRESULT hr = CoCreateInstance(
+ CLSID_WICImagingFactory2,
+ nullptr,
+ CLSCTX_INPROC_SERVER,
+ __uuidof(IWICImagingFactory2),
+ (LPVOID*)&s_Factory
+ );
+
+ if ( SUCCEEDED(hr) )
+ {
+ // WIC2 is available on Windows 8 and Windows 7 SP1 with KB 2670838 installed
+ g_WIC2 = true;
+ }
+ else
+ {
+ hr = CoCreateInstance(
+ CLSID_WICImagingFactory1,
+ nullptr,
+ CLSCTX_INPROC_SERVER,
+ __uuidof(IWICImagingFactory),
+ (LPVOID*)&s_Factory
+ );
+
+ if ( FAILED(hr) )
+ {
+ s_Factory = nullptr;
+ return nullptr;
+ }
+ }
+#else
+ HRESULT hr = CoCreateInstance(
+ CLSID_WICImagingFactory,
+ nullptr,
+ CLSCTX_INPROC_SERVER,
+ __uuidof(IWICImagingFactory),
+ (LPVOID*)&s_Factory
+ );
+
+ if ( FAILED(hr) )
+ {
+ s_Factory = nullptr;
+ return nullptr;
+ }
+#endif
+
+ return s_Factory;
+}
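
The factory returned above is created lazily on first use and cached for the life of the process, so the only prerequisite on the caller's side is that COM is initialized on the calling thread before any of the WIC-based entry points are used. A minimal sketch of that prerequisite (an assumption about the call site; DirectXTex itself never calls CoInitializeEx):

    // Sketch only -- initialize COM before using the WIC-based loaders/writers.
    #include <objbase.h>

    HRESULT hr = CoInitializeEx( nullptr, COINIT_MULTITHREADED );
    if ( FAILED(hr) )
        return hr;

    // ... LoadFromWICFile / SaveToWICFile calls go here ...

    CoUninitialize();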
+
+
+//-------------------------------------------------------------------------------------
+// Public helper function to get common WIC codec GUIDs
+//-------------------------------------------------------------------------------------
+REFGUID GetWICCodec( _In_ WICCodecs codec )
+{
+ switch( codec )
+ {
+ case WIC_CODEC_BMP:
+ return GUID_ContainerFormatBmp;
+
+ case WIC_CODEC_JPEG:
+ return GUID_ContainerFormatJpeg;
+
+ case WIC_CODEC_PNG:
+ return GUID_ContainerFormatPng;
+
+ case WIC_CODEC_TIFF:
+ return GUID_ContainerFormatTiff;
+
+ case WIC_CODEC_GIF:
+ return GUID_ContainerFormatGif;
+
+ case WIC_CODEC_WMP:
+ return GUID_ContainerFormatWmp;
+
+ case WIC_CODEC_ICO:
+ return GUID_ContainerFormatIco;
+
+ default:
+ return GUID_NULL;
+ }
+}
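
A hedged usage sketch of the helper above together with the WIC writer defined later in this library (the output path and the `scratch` image are illustrative assumptions):

    // Hypothetical call site: save the first image of a ScratchImage as PNG.
    const Image* img = scratch.GetImage( 0, 0, 0 );
    if ( img )
    {
        HRESULT hr = SaveToWICFile( *img, WIC_FLAGS_NONE,
                                    GetWICCodec( WIC_CODEC_PNG ), L"output.png", nullptr );
    }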
+
+
+//=====================================================================================
+// DXGI Format Utilities
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Returns bits-per-pixel for a given DXGI format, or 0 on failure
+//-------------------------------------------------------------------------------------
+size_t BitsPerPixel( DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R32G32B32A32_TYPELESS:
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ case DXGI_FORMAT_R32G32B32A32_UINT:
+ case DXGI_FORMAT_R32G32B32A32_SINT:
+ return 128;
+
+ case DXGI_FORMAT_R32G32B32_TYPELESS:
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ case DXGI_FORMAT_R32G32B32_UINT:
+ case DXGI_FORMAT_R32G32B32_SINT:
+ return 96;
+
+ case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ case DXGI_FORMAT_R16G16B16A16_UINT:
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ case DXGI_FORMAT_R16G16B16A16_SINT:
+ case DXGI_FORMAT_R32G32_TYPELESS:
+ case DXGI_FORMAT_R32G32_FLOAT:
+ case DXGI_FORMAT_R32G32_UINT:
+ case DXGI_FORMAT_R32G32_SINT:
+ case DXGI_FORMAT_R32G8X24_TYPELESS:
+ case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
+ case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS:
+ case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT:
+ return 64;
+
+ case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ case DXGI_FORMAT_R11G11B10_FLOAT:
+ case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ case DXGI_FORMAT_R8G8B8A8_UINT:
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ case DXGI_FORMAT_R8G8B8A8_SINT:
+ case DXGI_FORMAT_R16G16_TYPELESS:
+ case DXGI_FORMAT_R16G16_FLOAT:
+ case DXGI_FORMAT_R16G16_UNORM:
+ case DXGI_FORMAT_R16G16_UINT:
+ case DXGI_FORMAT_R16G16_SNORM:
+ case DXGI_FORMAT_R16G16_SINT:
+ case DXGI_FORMAT_R32_TYPELESS:
+ case DXGI_FORMAT_D32_FLOAT:
+ case DXGI_FORMAT_R32_FLOAT:
+ case DXGI_FORMAT_R32_UINT:
+ case DXGI_FORMAT_R32_SINT:
+ case DXGI_FORMAT_R24G8_TYPELESS:
+ case DXGI_FORMAT_D24_UNORM_S8_UINT:
+ case DXGI_FORMAT_R24_UNORM_X8_TYPELESS:
+ case DXGI_FORMAT_X24_TYPELESS_G8_UINT:
+ case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
+ case DXGI_FORMAT_R8G8_B8G8_UNORM:
+ case DXGI_FORMAT_G8R8_G8B8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ return 32;
+
+ case DXGI_FORMAT_R8G8_TYPELESS:
+ case DXGI_FORMAT_R8G8_UNORM:
+ case DXGI_FORMAT_R8G8_UINT:
+ case DXGI_FORMAT_R8G8_SNORM:
+ case DXGI_FORMAT_R8G8_SINT:
+ case DXGI_FORMAT_R16_TYPELESS:
+ case DXGI_FORMAT_R16_FLOAT:
+ case DXGI_FORMAT_D16_UNORM:
+ case DXGI_FORMAT_R16_UNORM:
+ case DXGI_FORMAT_R16_UINT:
+ case DXGI_FORMAT_R16_SNORM:
+ case DXGI_FORMAT_R16_SINT:
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ return 16;
+
+ case DXGI_FORMAT_R8_TYPELESS:
+ case DXGI_FORMAT_R8_UNORM:
+ case DXGI_FORMAT_R8_UINT:
+ case DXGI_FORMAT_R8_SNORM:
+ case DXGI_FORMAT_R8_SINT:
+ case DXGI_FORMAT_A8_UNORM:
+ return 8;
+
+ case DXGI_FORMAT_R1_UNORM:
+ return 1;
+
+ case DXGI_FORMAT_BC1_TYPELESS:
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ case DXGI_FORMAT_BC4_TYPELESS:
+ case DXGI_FORMAT_BC4_UNORM:
+ case DXGI_FORMAT_BC4_SNORM:
+ return 4;
+
+ case DXGI_FORMAT_BC2_TYPELESS:
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ case DXGI_FORMAT_BC3_TYPELESS:
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ case DXGI_FORMAT_BC5_TYPELESS:
+ case DXGI_FORMAT_BC5_UNORM:
+ case DXGI_FORMAT_BC5_SNORM:
+ case DXGI_FORMAT_BC6H_TYPELESS:
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ case DXGI_FORMAT_BC7_TYPELESS:
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return 8;
+
+#ifdef DXGI_1_2_FORMATS
+ case DXGI_FORMAT_B4G4R4A4_UNORM:
+ return 16;
+
+ // We don't support the video formats ( see IsVideo function )
+
+#endif // DXGI_1_2_FORMATS
+
+ default:
+ return 0;
+ }
+}
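
Note that for the block-compressed formats the value returned is the average bits per pixel, not the size of the 4x4 block. A few representative values, read straight off the switch above:

    // BitsPerPixel( DXGI_FORMAT_R8G8B8A8_UNORM ) == 32
    // BitsPerPixel( DXGI_FORMAT_BC1_UNORM )      == 4    (8-byte 4x4 block)
    // BitsPerPixel( DXGI_FORMAT_BC7_UNORM )      == 8    (16-byte 4x4 block)
    // BitsPerPixel( DXGI_FORMAT_R1_UNORM )       == 1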
+
+
+//-------------------------------------------------------------------------------------
+// Computes the image row pitch in bytes, and the slice pitch (total size in bytes of the image)
+// based on DXGI format, width, and height
+//-------------------------------------------------------------------------------------
+void ComputePitch( DXGI_FORMAT fmt, size_t width, size_t height,
+ size_t& rowPitch, size_t& slicePitch, DWORD flags )
+{
+ assert( IsValid(fmt) && !IsVideo(fmt) );
+
+ if ( IsCompressed(fmt) )
+ {
+ size_t bpb = ( fmt == DXGI_FORMAT_BC1_TYPELESS
+ || fmt == DXGI_FORMAT_BC1_UNORM
+ || fmt == DXGI_FORMAT_BC1_UNORM_SRGB
+ || fmt == DXGI_FORMAT_BC4_TYPELESS
+ || fmt == DXGI_FORMAT_BC4_UNORM
+ || fmt == DXGI_FORMAT_BC4_SNORM) ? 8 : 16;
+ size_t nbw = std::max<size_t>( 1, (width + 3) / 4 );
+ size_t nbh = std::max<size_t>( 1, (height + 3) / 4 );
+ rowPitch = nbw * bpb;
+
+ slicePitch = rowPitch * nbh;
+ }
+ else if ( IsPacked(fmt) )
+ {
+ rowPitch = ( ( width + 1 ) >> 1) * 4;
+
+ slicePitch = rowPitch * height;
+ }
+ else
+ {
+ size_t bpp;
+
+ if ( flags & CP_FLAGS_24BPP )
+ bpp = 24;
+ else if ( flags & CP_FLAGS_16BPP )
+ bpp = 16;
+ else if ( flags & CP_FLAGS_8BPP )
+ bpp = 8;
+ else
+ bpp = BitsPerPixel( fmt );
+
+ if ( flags & CP_FLAGS_LEGACY_DWORD )
+ {
+ // Special computation for some incorrectly created DDS files based on
+ // legacy DirectDraw assumptions about pitch alignment
+ rowPitch = ( ( width * bpp + 31 ) / 32 ) * sizeof(uint32_t);
+ slicePitch = rowPitch * height;
+ }
+ else
+ {
+ rowPitch = ( width * bpp + 7 ) / 8;
+ slicePitch = rowPitch * height;
+ }
+ }
+}
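
A worked example of the compressed and uncompressed branches above; the values follow directly from the arithmetic shown (CP_FLAGS_NONE denotes no special pitch handling):

    size_t rowPitch, slicePitch;

    // Block-compressed: BC1, 256x256 -> 64x64 blocks of 8 bytes each.
    ComputePitch( DXGI_FORMAT_BC1_UNORM, 256, 256, rowPitch, slicePitch, CP_FLAGS_NONE );
    // rowPitch == 512, slicePitch == 32768

    // Uncompressed: R8G8B8A8, 640x480 -> 32 bits per pixel.
    ComputePitch( DXGI_FORMAT_R8G8B8A8_UNORM, 640, 480, rowPitch, slicePitch, CP_FLAGS_NONE );
    // rowPitch == 2560, slicePitch == 1228800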
+
+
+//-------------------------------------------------------------------------------------
+// Converts to an SRGB equivalent type if available
+//-------------------------------------------------------------------------------------
+DXGI_FORMAT MakeSRGB( _In_ DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ return DXGI_FORMAT_R8G8B8A8_UNORM_SRGB;
+
+ case DXGI_FORMAT_BC1_UNORM:
+ return DXGI_FORMAT_BC1_UNORM_SRGB;
+
+ case DXGI_FORMAT_BC2_UNORM:
+ return DXGI_FORMAT_BC2_UNORM_SRGB;
+
+ case DXGI_FORMAT_BC3_UNORM:
+ return DXGI_FORMAT_BC3_UNORM_SRGB;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ return DXGI_FORMAT_B8G8R8A8_UNORM_SRGB;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ return DXGI_FORMAT_B8G8R8X8_UNORM_SRGB;
+
+ case DXGI_FORMAT_BC7_UNORM:
+ return DXGI_FORMAT_BC7_UNORM_SRGB;
+
+ default:
+ return fmt;
+ }
+}
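
For instance:

    // MakeSRGB( DXGI_FORMAT_BC3_UNORM )          == DXGI_FORMAT_BC3_UNORM_SRGB
    // MakeSRGB( DXGI_FORMAT_R16G16B16A16_FLOAT ) == DXGI_FORMAT_R16G16B16A16_FLOAT   (no _SRGB variant)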
+
+
+//-------------------------------------------------------------------------------------
+// Converts a format to an equivalent TYPELESS format if available
+//-------------------------------------------------------------------------------------
+DXGI_FORMAT MakeTypeless( _In_ DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R32G32B32A32_FLOAT:
+ case DXGI_FORMAT_R32G32B32A32_UINT:
+ case DXGI_FORMAT_R32G32B32A32_SINT:
+ return DXGI_FORMAT_R32G32B32A32_TYPELESS;
+
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ case DXGI_FORMAT_R32G32B32_UINT:
+ case DXGI_FORMAT_R32G32B32_SINT:
+ return DXGI_FORMAT_R32G32B32_TYPELESS;
+
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ case DXGI_FORMAT_R16G16B16A16_UINT:
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ case DXGI_FORMAT_R16G16B16A16_SINT:
+ return DXGI_FORMAT_R16G16B16A16_TYPELESS;
+
+ case DXGI_FORMAT_R32G32_FLOAT:
+ case DXGI_FORMAT_R32G32_UINT:
+ case DXGI_FORMAT_R32G32_SINT:
+ return DXGI_FORMAT_R32G32_TYPELESS;
+
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ case DXGI_FORMAT_R10G10B10A2_UINT:
+ return DXGI_FORMAT_R10G10B10A2_TYPELESS;
+
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+ case DXGI_FORMAT_R8G8B8A8_UINT:
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ case DXGI_FORMAT_R8G8B8A8_SINT:
+ return DXGI_FORMAT_R8G8B8A8_TYPELESS;
+
+ case DXGI_FORMAT_R16G16_FLOAT:
+ case DXGI_FORMAT_R16G16_UNORM:
+ case DXGI_FORMAT_R16G16_UINT:
+ case DXGI_FORMAT_R16G16_SNORM:
+ case DXGI_FORMAT_R16G16_SINT:
+ return DXGI_FORMAT_R16G16_TYPELESS;
+
+ case DXGI_FORMAT_D32_FLOAT:
+ case DXGI_FORMAT_R32_FLOAT:
+ case DXGI_FORMAT_R32_UINT:
+ case DXGI_FORMAT_R32_SINT:
+ return DXGI_FORMAT_R32_TYPELESS;
+
+ case DXGI_FORMAT_R8G8_UNORM:
+ case DXGI_FORMAT_R8G8_UINT:
+ case DXGI_FORMAT_R8G8_SNORM:
+ case DXGI_FORMAT_R8G8_SINT:
+ return DXGI_FORMAT_R8G8_TYPELESS;
+
+ case DXGI_FORMAT_R16_FLOAT:
+ case DXGI_FORMAT_D16_UNORM:
+ case DXGI_FORMAT_R16_UNORM:
+ case DXGI_FORMAT_R16_UINT:
+ case DXGI_FORMAT_R16_SNORM:
+ case DXGI_FORMAT_R16_SINT:
+ return DXGI_FORMAT_R16_TYPELESS;
+
+ case DXGI_FORMAT_R8_UNORM:
+ case DXGI_FORMAT_R8_UINT:
+ case DXGI_FORMAT_R8_SNORM:
+ case DXGI_FORMAT_R8_SINT:
+ case DXGI_FORMAT_A8_UNORM:
+ return DXGI_FORMAT_R8_TYPELESS;
+
+ case DXGI_FORMAT_BC1_UNORM:
+ case DXGI_FORMAT_BC1_UNORM_SRGB:
+ return DXGI_FORMAT_BC1_TYPELESS;
+
+ case DXGI_FORMAT_BC2_UNORM:
+ case DXGI_FORMAT_BC2_UNORM_SRGB:
+ return DXGI_FORMAT_BC2_TYPELESS;
+
+ case DXGI_FORMAT_BC3_UNORM:
+ case DXGI_FORMAT_BC3_UNORM_SRGB:
+ return DXGI_FORMAT_BC3_TYPELESS;
+
+ case DXGI_FORMAT_BC4_UNORM:
+ case DXGI_FORMAT_BC4_SNORM:
+ return DXGI_FORMAT_BC4_TYPELESS;
+
+ case DXGI_FORMAT_BC5_UNORM:
+ case DXGI_FORMAT_BC5_SNORM:
+ return DXGI_FORMAT_BC5_TYPELESS;
+
+ case DXGI_FORMAT_B8G8R8A8_UNORM:
+ case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+ return DXGI_FORMAT_B8G8R8A8_TYPELESS;
+
+ case DXGI_FORMAT_B8G8R8X8_UNORM:
+ case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+ return DXGI_FORMAT_B8G8R8X8_TYPELESS;
+
+ case DXGI_FORMAT_BC6H_UF16:
+ case DXGI_FORMAT_BC6H_SF16:
+ return DXGI_FORMAT_BC6H_TYPELESS;
+
+ case DXGI_FORMAT_BC7_UNORM:
+ case DXGI_FORMAT_BC7_UNORM_SRGB:
+ return DXGI_FORMAT_BC7_TYPELESS;
+
+ default:
+ return fmt;
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// Converts a TYPELESS format to an equivalent UNORM format if available
+//-------------------------------------------------------------------------------------
+DXGI_FORMAT MakeTypelessUNORM( _In_ DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+ return DXGI_FORMAT_R16G16B16A16_UNORM;
+
+ case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+ return DXGI_FORMAT_R10G10B10A2_UNORM;
+
+ case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+ return DXGI_FORMAT_R8G8B8A8_UNORM;
+
+ case DXGI_FORMAT_R16G16_TYPELESS:
+ return DXGI_FORMAT_R16G16_UNORM;
+
+ case DXGI_FORMAT_R8G8_TYPELESS:
+ return DXGI_FORMAT_R8G8_UNORM;
+
+ case DXGI_FORMAT_R16_TYPELESS:
+ return DXGI_FORMAT_R16_UNORM;
+
+ case DXGI_FORMAT_R8_TYPELESS:
+ return DXGI_FORMAT_R8_UNORM;
+
+ case DXGI_FORMAT_BC1_TYPELESS:
+ return DXGI_FORMAT_BC1_UNORM;
+
+ case DXGI_FORMAT_BC2_TYPELESS:
+ return DXGI_FORMAT_BC2_UNORM;
+
+ case DXGI_FORMAT_BC3_TYPELESS:
+ return DXGI_FORMAT_BC3_UNORM;
+
+ case DXGI_FORMAT_BC4_TYPELESS:
+ return DXGI_FORMAT_BC4_UNORM;
+
+ case DXGI_FORMAT_BC5_TYPELESS:
+ return DXGI_FORMAT_BC5_UNORM;
+
+ case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+ return DXGI_FORMAT_B8G8R8A8_UNORM;
+
+ case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+ return DXGI_FORMAT_B8G8R8X8_UNORM;
+
+ case DXGI_FORMAT_BC7_TYPELESS:
+ return DXGI_FORMAT_BC7_UNORM;
+
+ default:
+ return fmt;
+ }
+}
+
+
+//-------------------------------------------------------------------------------------
+// Converts a TYPELESS format to an equivalent FLOAT format if available
+//-------------------------------------------------------------------------------------
+DXGI_FORMAT MakeTypelessFLOAT( _In_ DXGI_FORMAT fmt )
+{
+ switch( fmt )
+ {
+ case DXGI_FORMAT_R32G32B32A32_TYPELESS:
+ return DXGI_FORMAT_R32G32B32A32_FLOAT;
+
+ case DXGI_FORMAT_R32G32B32_TYPELESS:
+ return DXGI_FORMAT_R32G32B32_FLOAT;
+
+ case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+ return DXGI_FORMAT_R16G16B16A16_FLOAT;
+
+ case DXGI_FORMAT_R32G32_TYPELESS:
+ return DXGI_FORMAT_R32G32_FLOAT;
+
+ case DXGI_FORMAT_R16G16_TYPELESS:
+ return DXGI_FORMAT_R16G16_FLOAT;
+
+ case DXGI_FORMAT_R32_TYPELESS:
+ return DXGI_FORMAT_R32_FLOAT;
+
+ case DXGI_FORMAT_R16_TYPELESS:
+ return DXGI_FORMAT_R16_FLOAT;
+
+ default:
+ return fmt;
+ }
+}
+
+
+//=====================================================================================
+// TexMetadata
+//=====================================================================================
+
+size_t TexMetadata::ComputeIndex( _In_ size_t mip, _In_ size_t item, _In_ size_t slice ) const
+{
+ if ( mip >= mipLevels )
+ return size_t(-1);
+
+ switch( dimension )
+ {
+ case TEX_DIMENSION_TEXTURE1D:
+ case TEX_DIMENSION_TEXTURE2D:
+ if ( slice > 0 )
+ return size_t(-1);
+
+ if ( item >= arraySize )
+ return size_t(-1);
+
+ return (item*( mipLevels ) + mip);
+
+ case TEX_DIMENSION_TEXTURE3D:
+ if ( item > 0 )
+ {
+ // No support for arrays of volumes
+ return size_t(-1);
+ }
+ else
+ {
+ size_t index = 0;
+ size_t d = depth;
+
+ for( size_t level = 0; level < mip; ++level )
+ {
+ index += d;
+ if ( d > 1 )
+ d >>= 1;
+ }
+
+ if ( slice >= d )
+ return size_t(-1);
+
+ index += slice;
+
+ return index;
+ }
+ break;
+
+ default:
+ return size_t(-1);
+ }
+}
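
Two worked examples of the indexing above (the metadata values are illustrative assumptions):

    // 2D texture array, mipLevels = 10:
    //   ComputeIndex( 3, 2, 0 ) == 2*10 + 3 == 23
    //
    // 3D volume, depth = 8, mipLevels = 4 (slice counts per mip: 8, 4, 2, 1):
    //   ComputeIndex( 2, 0, 1 ) == 8 + 4 + 1 == 13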
+
+
+//=====================================================================================
+// Blob - Bitmap image container
+//=====================================================================================
+
+void Blob::Release()
+{
+ if ( _buffer )
+ {
+ _aligned_free( _buffer );
+ _buffer = nullptr;
+ }
+
+ _size = 0;
+}
+
+HRESULT Blob::Initialize( size_t size )
+{
+ if ( !size )
+ return E_INVALIDARG;
+
+ Release();
+
+ _buffer = _aligned_malloc( size, 16 );
+ if ( !_buffer )
+ {
+ Release();
+ return E_OUTOFMEMORY;
+ }
+
+ _size = size;
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp b/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp
new file mode 100644
index 00000000..05cb76a3
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/DirectXTexWIC.cpp
@@ -0,0 +1,946 @@
+//-------------------------------------------------------------------------------------
+// DirectXTexWIC.cpp
+//
+// DirectX Texture Library - WIC-based file reader/writer
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkId=248926
+//-------------------------------------------------------------------------------------
+
+#include "DirectXTexP.h"
+
+//-------------------------------------------------------------------------------------
+// WIC Pixel Format nearest conversion table
+//-------------------------------------------------------------------------------------
+
+struct WICConvert
+{
+ GUID source;
+ GUID target;
+};
+
+static WICConvert g_WICConvert[] =
+{
+    // The formats listed in XnaTexUtil::g_WICFormats are supported directly and need no conversion.
+    // Note: the target GUID in this conversion table must be one of those directly supported formats.
+
+ { GUID_WICPixelFormat1bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat2bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat4bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat8bppIndexed, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+
+ { GUID_WICPixelFormat2bppGray, GUID_WICPixelFormat8bppGray }, // DXGI_FORMAT_R8_UNORM
+ { GUID_WICPixelFormat4bppGray, GUID_WICPixelFormat8bppGray }, // DXGI_FORMAT_R8_UNORM
+
+ { GUID_WICPixelFormat16bppGrayFixedPoint, GUID_WICPixelFormat16bppGrayHalf }, // DXGI_FORMAT_R16_FLOAT
+ { GUID_WICPixelFormat32bppGrayFixedPoint, GUID_WICPixelFormat32bppGrayFloat }, // DXGI_FORMAT_R32_FLOAT
+
+ { GUID_WICPixelFormat16bppBGR555, GUID_WICPixelFormat16bppBGRA5551 }, // DXGI_FORMAT_B5G5R5A1_UNORM
+ { GUID_WICPixelFormat32bppBGR101010, GUID_WICPixelFormat32bppRGBA1010102 }, // DXGI_FORMAT_R10G10B10A2_UNORM
+
+ { GUID_WICPixelFormat24bppBGR, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat24bppRGB, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat32bppPBGRA, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat32bppPRGBA, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+
+ { GUID_WICPixelFormat48bppRGB, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat48bppBGR, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat64bppBGRA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat64bppPRGBA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat64bppPBGRA, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+
+ { GUID_WICPixelFormat48bppRGBFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat48bppBGRFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat64bppRGBAFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat64bppBGRAFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat64bppRGBFixedPoint, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat64bppRGBHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+ { GUID_WICPixelFormat48bppRGBHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+
+ { GUID_WICPixelFormat128bppPRGBAFloat, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT
+ { GUID_WICPixelFormat128bppRGBFloat, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT
+ { GUID_WICPixelFormat128bppRGBAFixedPoint, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT
+ { GUID_WICPixelFormat128bppRGBFixedPoint, GUID_WICPixelFormat128bppRGBAFloat }, // DXGI_FORMAT_R32G32B32A32_FLOAT
+
+ { GUID_WICPixelFormat32bppCMYK, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat64bppCMYK, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat40bppCMYKAlpha, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat80bppCMYKAlpha, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ { GUID_WICPixelFormat32bppRGB, GUID_WICPixelFormat32bppRGBA }, // DXGI_FORMAT_R8G8B8A8_UNORM
+ { GUID_WICPixelFormat64bppRGB, GUID_WICPixelFormat64bppRGBA }, // DXGI_FORMAT_R16G16B16A16_UNORM
+ { GUID_WICPixelFormat64bppPRGBAHalf, GUID_WICPixelFormat64bppRGBAHalf }, // DXGI_FORMAT_R16G16B16A16_FLOAT
+#endif
+
+ // We don't support n-channel formats
+};
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Returns the DXGI format and optionally the WIC pixel GUID to convert to
+//-------------------------------------------------------------------------------------
+static DXGI_FORMAT _DetermineFormat( _In_ const WICPixelFormatGUID& pixelFormat, _In_ DWORD flags,
+ _Out_opt_ WICPixelFormatGUID* pConvert )
+{
+ if ( pConvert )
+ memset( pConvert, 0, sizeof(WICPixelFormatGUID) );
+
+ DXGI_FORMAT format = _WICToDXGI( pixelFormat );
+
+ if ( format == DXGI_FORMAT_UNKNOWN )
+ {
+ if ( memcmp( &GUID_WICPixelFormat96bppRGBFixedPoint, &pixelFormat, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+#if (_WIN32_WINNT >= 0x0602 /*_WIN32_WINNT_WIN8*/) || defined(_WIN7_PLATFORM_UPDATE)
+ if ( _IsWIC2() )
+ {
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat96bppRGBFloat, sizeof(WICPixelFormatGUID) );
+ format = DXGI_FORMAT_R32G32B32_FLOAT;
+ }
+ else
+#endif
+ {
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat128bppRGBAFloat, sizeof(WICPixelFormatGUID) );
+ format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+ }
+ }
+ else
+ {
+ for( size_t i=0; i < _countof(g_WICConvert); ++i )
+ {
+ if ( memcmp( &g_WICConvert[i].source, &pixelFormat, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+ if ( pConvert )
+ memcpy( pConvert, &g_WICConvert[i].target, sizeof(WICPixelFormatGUID) );
+
+ format = _WICToDXGI( g_WICConvert[i].target );
+ assert( format != DXGI_FORMAT_UNKNOWN );
+ break;
+ }
+ }
+ }
+ }
+
+ // Handle special cases based on flags
+ switch (format)
+ {
+ case DXGI_FORMAT_B8G8R8A8_UNORM: // BGRA
+ case DXGI_FORMAT_B8G8R8X8_UNORM: // BGRX
+ if ( flags & WIC_FLAGS_FORCE_RGB )
+ {
+ format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA, sizeof(WICPixelFormatGUID) );
+ }
+ break;
+
+ case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+ if ( flags & WIC_FLAGS_NO_X2_BIAS )
+ {
+ format = DXGI_FORMAT_R10G10B10A2_UNORM;
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA1010102, sizeof(WICPixelFormatGUID) );
+ }
+ break;
+
+ case DXGI_FORMAT_B5G5R5A1_UNORM:
+ case DXGI_FORMAT_B5G6R5_UNORM:
+ if ( flags & WIC_FLAGS_NO_16BPP )
+ {
+ format = DXGI_FORMAT_R8G8B8A8_UNORM;
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat32bppRGBA, sizeof(WICPixelFormatGUID) );
+ }
+ break;
+
+ case DXGI_FORMAT_R1_UNORM:
+ if ( !(flags & WIC_FLAGS_ALLOW_MONO ) )
+ {
+            // By default we want to promote a black & white image to greyscale since R1 is not a generally supported D3D format
+ format = DXGI_FORMAT_R8_UNORM;
+ if ( pConvert )
+ memcpy( pConvert, &GUID_WICPixelFormat8bppGray, sizeof(WICPixelFormatGUID) );
+ }
+ }
+
+ return format;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Determines metadata for image
+//-------------------------------------------------------------------------------------
+static HRESULT _DecodeMetadata( _In_ DWORD flags,
+ _In_ IWICBitmapDecoder *decoder, _In_ IWICBitmapFrameDecode *frame,
+ _Out_ TexMetadata& metadata, _Out_opt_ WICPixelFormatGUID* pConvert )
+{
+ if ( !decoder || !frame )
+ return E_POINTER;
+
+ memset( &metadata, 0, sizeof(TexMetadata) );
+ metadata.depth = 1;
+ metadata.mipLevels = 1;
+ metadata.dimension = TEX_DIMENSION_TEXTURE2D;
+
+ UINT w, h;
+ HRESULT hr = frame->GetSize( &w, &h );
+ if ( FAILED(hr) )
+ return hr;
+
+ metadata.width = w;
+ metadata.height = h;
+
+ if ( flags & WIC_FLAGS_ALL_FRAMES )
+ {
+ UINT fcount;
+ hr = decoder->GetFrameCount( &fcount );
+ if ( FAILED(hr) )
+ return hr;
+
+ metadata.arraySize = fcount;
+ }
+ else
+ metadata.arraySize = 1;
+
+ WICPixelFormatGUID pixelFormat;
+ hr = frame->GetPixelFormat( &pixelFormat );
+ if ( FAILED(hr) )
+ return hr;
+
+ metadata.format = _DetermineFormat( pixelFormat, flags, pConvert );
+ if ( metadata.format == DXGI_FORMAT_UNKNOWN )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Decodes a single frame
+//-------------------------------------------------------------------------------------
+static HRESULT _DecodeSingleFrame( _In_ DWORD flags, _In_ const TexMetadata& metadata, _In_ const WICPixelFormatGUID& convertGUID,
+ _In_ IWICBitmapFrameDecode *frame, _Inout_ ScratchImage& image )
+{
+ if ( !frame )
+ return E_POINTER;
+
+ HRESULT hr = image.Initialize2D( metadata.format, metadata.width, metadata.height, 1, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ const Image *img = image.GetImage( 0, 0, 0 );
+ if ( !img )
+ return E_POINTER;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ if ( memcmp( &convertGUID, &GUID_NULL, sizeof(GUID) ) == 0 )
+ {
+ hr = frame->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ ScopedObject<IWICFormatConverter> FC;
+ hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( frame, convertGUID, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Decodes an image array, resizing/format converting as needed
+//-------------------------------------------------------------------------------------
+static HRESULT _DecodeMultiframe( _In_ DWORD flags, _In_ const TexMetadata& metadata,
+ _In_ IWICBitmapDecoder *decoder, _Inout_ ScratchImage& image )
+{
+ if ( !decoder )
+ return E_POINTER;
+
+ HRESULT hr = image.Initialize2D( metadata.format, metadata.width, metadata.height, metadata.arraySize, 1 );
+ if ( FAILED(hr) )
+ return hr;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ WICPixelFormatGUID sourceGUID;
+ if ( !_DXGIToWIC( metadata.format, sourceGUID ) )
+ return E_FAIL;
+
+ for( size_t index = 0; index < metadata.arraySize; ++index )
+ {
+ const Image* img = image.GetImage( 0, index, 0 );
+ if ( !img )
+ return E_POINTER;
+
+ ScopedObject<IWICBitmapFrameDecode> frame;
+ hr = decoder->GetFrame( static_cast<UINT>( index ), &frame );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID pfGuid;
+ hr = frame->GetPixelFormat( &pfGuid );
+ if ( FAILED(hr) )
+ return hr;
+
+ UINT w, h;
+ hr = frame->GetSize( &w, &h );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &pfGuid, &sourceGUID, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+ if ( w == metadata.width && h == metadata.height )
+ {
+                // This frame does not need to be resized or format converted, just copy...
+ hr = frame->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+                // This frame needs resizing, but not format conversion
+ ScopedObject<IWICBitmapScaler> scaler;
+ hr = pWIC->CreateBitmapScaler( &scaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->Initialize( frame.Get(), static_cast<UINT>( metadata.width ), static_cast<UINT>( metadata.height ), _GetWICInterp( flags ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ else
+ {
+            // This frame requires format conversion
+ ScopedObject<IWICFormatConverter> FC;
+ hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( frame.Get(), pfGuid, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( w == metadata.width && h == metadata.height )
+ {
+ // This frame is the same size, no need to scale
+ hr = FC->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+                // This frame needs resizing and format conversion
+ ScopedObject<IWICBitmapScaler> scaler;
+ hr = pWIC->CreateBitmapScaler( &scaler );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->Initialize( FC.Get(), static_cast<UINT>( metadata.width ), static_cast<UINT>( metadata.height ), _GetWICInterp( flags ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = scaler->CopyPixels( 0, static_cast<UINT>( img->rowPitch ), static_cast<UINT>( img->slicePitch ), img->pixels );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ }
+ }
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Encodes a single frame
+//-------------------------------------------------------------------------------------
+static HRESULT _EncodeImage( _In_ const Image& image, _In_ DWORD flags, _In_ IWICBitmapFrameEncode* frame, _In_opt_ IPropertyBag2* props, _In_opt_ const GUID* targetFormat )
+{
+ if ( !frame )
+ return E_INVALIDARG;
+
+ if ( !image.pixels )
+ return E_POINTER;
+
+ WICPixelFormatGUID pfGuid;
+ if ( !_DXGIToWIC( image.format, pfGuid ) )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ HRESULT hr = frame->Initialize( props );
+ if ( FAILED(hr) )
+ return hr;
+
+#ifdef _AMD64_
+ if ( (image.width > 0xFFFFFFFF) || (image.height > 0xFFFFFFFF) )
+ return E_INVALIDARG;
+#endif
+
+ hr = frame->SetSize( static_cast<UINT>( image.width ), static_cast<UINT>( image.height ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = frame->SetResolution( 72, 72 );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICPixelFormatGUID targetGuid = (targetFormat) ? (*targetFormat) : pfGuid;
+ hr = frame->SetPixelFormat( &targetGuid );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &targetGuid, &pfGuid, sizeof(WICPixelFormatGUID) ) != 0 )
+ {
+ // Conversion required to write
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICBitmap> source;
+ hr = pWIC->CreateBitmapFromMemory( static_cast<UINT>( image.width ), static_cast<UINT>( image.height ), pfGuid,
+ static_cast<UINT>( image.rowPitch ), static_cast<UINT>( image.slicePitch ),
+ image.pixels, &source );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICFormatConverter> FC;
+ hr = pWIC->CreateFormatConverter( &FC );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = FC->Initialize( source.Get(), targetGuid, _GetWICDither( flags ), 0, 0, WICBitmapPaletteTypeCustom );
+ if ( FAILED(hr) )
+ return hr;
+
+ WICRect rect = { 0, 0, static_cast<UINT>( image.width ), static_cast<UINT>( image.height ) };
+ hr = frame->WriteSource( FC.Get(), &rect );
+ if ( FAILED(hr) )
+ return hr;
+ }
+ else
+ {
+ // No conversion required
+ hr = frame->WritePixels( static_cast<UINT>( image.height ), static_cast<UINT>( image.rowPitch ), static_cast<UINT>( image.slicePitch ),
+ reinterpret_cast<uint8_t*>( image.pixels ) );
+ if ( FAILED(hr) )
+ return hr;
+ }
+
+ hr = frame->Commit();
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+static HRESULT _EncodeSingleFrame( _In_ const Image& image, _In_ DWORD flags,
+ _In_ REFGUID guidContainerFormat, _Inout_ IStream* stream, _In_opt_ const GUID* targetFormat )
+{
+ if ( !stream )
+ return E_INVALIDARG;
+
+ // Initialize WIC
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICBitmapEncoder> encoder;
+ HRESULT hr = pWIC->CreateEncoder( guidContainerFormat, 0, &encoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = encoder->Initialize( stream, WICBitmapEncoderNoCache );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFrameEncode> frame;
+ ScopedObject<IPropertyBag2> props;
+ hr = encoder->CreateNewFrame( &frame, &props );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( memcmp( &guidContainerFormat, &GUID_ContainerFormatBmp, sizeof(WICPixelFormatGUID) ) == 0 )
+ {
+        // Opt in to Windows 8's support for writing 32-bit Windows BMP files with an alpha channel, when available
+ PROPBAG2 option = { 0 };
+ option.pstrName = L"EnableV5Header32bppBGRA";
+
+ VARIANT varValue;
+ varValue.vt = VT_BOOL;
+ varValue.boolVal = VARIANT_TRUE;
+ hr = props->Write( 1, &option, &varValue );
+ if ( FAILED(hr) )
+ {
+ // Fails on older versions of WIC, so we default to the null property bag
+ props.Reset();
+ }
+ }
+
+ hr = _EncodeImage( image, flags, frame.Get(), props.Get(), targetFormat );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = encoder->Commit();
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Encodes an image array
+//-------------------------------------------------------------------------------------
+static HRESULT _EncodeMultiframe( _In_count_(nimages) const Image* images, _In_ size_t nimages, _In_ DWORD flags,
+ _In_ REFGUID guidContainerFormat, _Inout_ IStream* stream, _In_opt_ const GUID* targetFormat )
+{
+ if ( !stream || nimages < 2 )
+ return E_INVALIDARG;
+
+ if ( !images )
+ return E_POINTER;
+
+ // Initialize WIC
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICBitmapEncoder> encoder;
+ HRESULT hr = pWIC->CreateEncoder( guidContainerFormat, 0, &encoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapEncoderInfo> einfo;
+ hr = encoder->GetEncoderInfo( &einfo );
+ if ( FAILED(hr) )
+ return hr;
+
+ BOOL mframe = FALSE;
+ hr = einfo->DoesSupportMultiframe( &mframe );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( !mframe )
+ return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
+
+ hr = encoder->Initialize( stream, WICBitmapEncoderNoCache );
+ if ( FAILED(hr) )
+ return hr;
+
+ for( size_t index=0; index < nimages; ++index )
+ {
+ ScopedObject<IWICBitmapFrameEncode> frame;
+ hr = encoder->CreateNewFrame( &frame, nullptr );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = _EncodeImage( images[index], flags, frame.Get(), nullptr, targetFormat );
+ if ( FAILED(hr) )
+ return hr;
+ }
+
+ hr = encoder->Commit();
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//=====================================================================================
+// Entry-points
+//=====================================================================================
+
+//-------------------------------------------------------------------------------------
+// Obtain metadata from WIC-supported file in memory
+//-------------------------------------------------------------------------------------
+HRESULT GetMetadataFromWICMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata& metadata )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( size > 0xFFFFFFFF )
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+#endif
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ // Create input stream for memory
+ ScopedObject<IWICStream> stream;
+ HRESULT hr = pWIC->CreateStream( &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = stream->InitializeFromMemory( reinterpret_cast<BYTE*>( const_cast<void*>( pSource ) ),
+ static_cast<UINT>( size ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Initialize WIC
+ ScopedObject<IWICBitmapDecoder> decoder;
+ hr = pWIC->CreateDecoderFromStream( stream.Get(), 0, WICDecodeMetadataCacheOnDemand, &decoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFrameDecode> frame;
+ hr = decoder->GetFrame( 0, &frame );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Get metadata
+ hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), metadata, 0 );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Obtain metadata from WIC-supported file on disk
+//-------------------------------------------------------------------------------------
+HRESULT GetMetadataFromWICFile( LPCWSTR szFile, DWORD flags, TexMetadata& metadata )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ // Initialize WIC
+ ScopedObject<IWICBitmapDecoder> decoder;
+ HRESULT hr = pWIC->CreateDecoderFromFilename( szFile, 0, GENERIC_READ, WICDecodeMetadataCacheOnDemand, &decoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFrameDecode> frame;
+ hr = decoder->GetFrame( 0, &frame );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Get metadata
+ hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), metadata, 0 );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a WIC-supported file in memory
+//-------------------------------------------------------------------------------------
+HRESULT LoadFromWICMemory( LPCVOID pSource, size_t size, DWORD flags, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !pSource || size == 0 )
+ return E_INVALIDARG;
+
+#ifdef _AMD64_
+ if ( size > 0xFFFFFFFF )
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+#endif
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ image.Release();
+
+ // Create input stream for memory
+ ScopedObject<IWICStream> stream;
+ HRESULT hr = pWIC->CreateStream( &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = stream->InitializeFromMemory( reinterpret_cast<uint8_t*>( const_cast<void*>( pSource ) ), static_cast<DWORD>( size ) );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Initialize WIC
+ ScopedObject<IWICBitmapDecoder> decoder;
+ hr = pWIC->CreateDecoderFromStream( stream.Get(), 0, WICDecodeMetadataCacheOnDemand, &decoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFrameDecode> frame;
+ hr = decoder->GetFrame( 0, &frame );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Get metadata
+ TexMetadata mdata;
+ WICPixelFormatGUID convertGUID = {0};
+ hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), mdata, &convertGUID );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( (mdata.arraySize > 1) && (flags & WIC_FLAGS_ALL_FRAMES) )
+ {
+ hr = _DecodeMultiframe( flags, mdata, decoder.Get(), image );
+ }
+ else
+ {
+ hr = _DecodeSingleFrame( flags, mdata, convertGUID, frame.Get(), image );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Load a WIC-supported file from disk
+//-------------------------------------------------------------------------------------
+HRESULT LoadFromWICFile( LPCWSTR szFile, DWORD flags, TexMetadata* metadata, ScratchImage& image )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ image.Release();
+
+ // Initialize WIC
+ ScopedObject<IWICBitmapDecoder> decoder;
+ HRESULT hr = pWIC->CreateDecoderFromFilename( szFile, 0, GENERIC_READ, WICDecodeMetadataCacheOnDemand, &decoder );
+ if ( FAILED(hr) )
+ return hr;
+
+ ScopedObject<IWICBitmapFrameDecode> frame;
+ hr = decoder->GetFrame( 0, &frame );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Get metadata
+ TexMetadata mdata;
+ WICPixelFormatGUID convertGUID = {0};
+ hr = _DecodeMetadata( flags, decoder.Get(), frame.Get(), mdata, &convertGUID );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( (mdata.arraySize > 1) && (flags & WIC_FLAGS_ALL_FRAMES) )
+ {
+ hr = _DecodeMultiframe( flags, mdata, decoder.Get(), image );
+ }
+ else
+ {
+ hr = _DecodeSingleFrame( flags, mdata, convertGUID, frame.Get(), image );
+ }
+
+ if ( FAILED(hr) )
+ {
+ image.Release();
+ return hr;
+ }
+
+ if ( metadata )
+ memcpy( metadata, &mdata, sizeof(TexMetadata) );
+
+ return S_OK;
+}
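
A hedged call-site sketch for the loader above (the file name is illustrative):

    TexMetadata info;
    ScratchImage image;
    HRESULT hr = LoadFromWICFile( L"input.jpg", WIC_FLAGS_NONE, &info, image );
    if ( SUCCEEDED(hr) )
    {
        // info.width / info.height / info.format describe the decoded image;
        // image.GetImage( 0, 0, 0 )->pixels points at the pixel data.
    }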
+
+
+//-------------------------------------------------------------------------------------
+// Save a WIC-supported file to memory
+//-------------------------------------------------------------------------------------
+HRESULT SaveToWICMemory( const Image& image, DWORD flags, REFGUID guidContainerFormat, Blob& blob, const GUID* targetFormat )
+{
+ if ( !image.pixels )
+ return E_POINTER;
+
+ blob.Release();
+
+ ScopedObject<IStream> stream;
+ HRESULT hr = CreateStreamOnHGlobal( 0, TRUE, &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = _EncodeSingleFrame( image, flags, guidContainerFormat, stream.Get(), targetFormat );
+ if ( FAILED(hr) )
+ return hr;
+
+ // Copy stream data into blob
+ STATSTG stat;
+ hr = stream->Stat( &stat, STATFLAG_NONAME );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( stat.cbSize.HighPart > 0 )
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+
+ hr = blob.Initialize( stat.cbSize.LowPart );
+ if ( FAILED(hr) )
+ return hr;
+
+ LARGE_INTEGER li = { 0 };
+ hr = stream->Seek( li, STREAM_SEEK_SET, 0 );
+ if ( FAILED(hr) )
+ return hr;
+
+ DWORD bytesRead;
+ hr = stream->Read( blob.GetBufferPointer(), static_cast<ULONG>( blob.GetBufferSize() ), &bytesRead );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( bytesRead != blob.GetBufferSize() )
+ return E_FAIL;
+
+ return S_OK;
+}
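
A hedged call-site sketch for the in-memory writer above (`image` is assumed to be a populated ScratchImage):

    Blob blob;
    HRESULT hr = SaveToWICMemory( *image.GetImage( 0, 0, 0 ), WIC_FLAGS_NONE,
                                  GetWICCodec( WIC_CODEC_PNG ), blob, nullptr );
    if ( SUCCEEDED(hr) )
    {
        const void* data = blob.GetBufferPointer();
        size_t      size = blob.GetBufferSize();
        // hand data/size to whatever needs the encoded PNG bytes
    }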
+
+HRESULT SaveToWICMemory( const Image* images, size_t nimages, DWORD flags, REFGUID guidContainerFormat, Blob& blob, const GUID* targetFormat )
+{
+ if ( !images || nimages == 0 )
+ return E_INVALIDARG;
+
+ blob.Release();
+
+ ScopedObject<IStream> stream;
+ HRESULT hr = CreateStreamOnHGlobal( 0, TRUE, &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages > 1 )
+ hr = _EncodeMultiframe( images, nimages, flags, guidContainerFormat, stream.Get(), targetFormat );
+ else
+ hr = _EncodeSingleFrame( images[0], flags, guidContainerFormat, stream.Get(), targetFormat );
+
+ if ( FAILED(hr) )
+ return hr;
+
+ // Copy stream data into blob
+ STATSTG stat;
+ hr = stream->Stat( &stat, STATFLAG_NONAME );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( stat.cbSize.HighPart > 0 )
+ return HRESULT_FROM_WIN32( ERROR_FILE_TOO_LARGE );
+
+ hr = blob.Initialize( stat.cbSize.LowPart );
+ if ( FAILED(hr) )
+ return hr;
+
+ LARGE_INTEGER li = { 0 };
+ hr = stream->Seek( li, STREAM_SEEK_SET, 0 );
+ if ( FAILED(hr) )
+ return hr;
+
+ DWORD bytesRead;
+ hr = stream->Read( blob.GetBufferPointer(), static_cast<ULONG>( blob.GetBufferSize() ), &bytesRead );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( bytesRead != blob.GetBufferSize() )
+ return E_FAIL;
+
+ return S_OK;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Save a WIC-supported file to disk
+//-------------------------------------------------------------------------------------
+HRESULT SaveToWICFile( const Image& image, DWORD flags, REFGUID guidContainerFormat, LPCWSTR szFile, const GUID* targetFormat )
+{
+ if ( !szFile )
+ return E_INVALIDARG;
+
+ if ( !image.pixels )
+ return E_POINTER;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICStream> stream;
+ HRESULT hr = pWIC->CreateStream( &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = stream->InitializeFromFilename( szFile, GENERIC_WRITE );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = _EncodeSingleFrame( image, flags, guidContainerFormat, stream.Get(), targetFormat );
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+HRESULT SaveToWICFile( const Image* images, size_t nimages, DWORD flags, REFGUID guidContainerFormat, LPCWSTR szFile, const GUID* targetFormat )
+{
+ if ( !szFile || !images || nimages == 0 )
+ return E_INVALIDARG;
+
+ IWICImagingFactory* pWIC = _GetWIC();
+ if ( !pWIC )
+ return E_NOINTERFACE;
+
+ ScopedObject<IWICStream> stream;
+ HRESULT hr = pWIC->CreateStream( &stream );
+ if ( FAILED(hr) )
+ return hr;
+
+ hr = stream->InitializeFromFilename( szFile, GENERIC_WRITE );
+ if ( FAILED(hr) )
+ return hr;
+
+ if ( nimages > 1 )
+ hr = _EncodeMultiframe( images, nimages, flags, guidContainerFormat, stream.Get(), targetFormat );
+ else
+ hr = _EncodeSingleFrame( images[0], flags, guidContainerFormat, stream.Get(), targetFormat );
+
+ if ( FAILED(hr) )
+ return hr;
+
+ return S_OK;
+}
+
+}; // namespace
diff --git a/thirdparty/directxtex/DirectXTex/scoped.h b/thirdparty/directxtex/DirectXTex/scoped.h
new file mode 100644
index 00000000..81816069
--- /dev/null
+++ b/thirdparty/directxtex/DirectXTex/scoped.h
@@ -0,0 +1,70 @@
+//-------------------------------------------------------------------------------------
+// scoped.h
+//
+// Utility header with helper classes for exception-safe handling of resources
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#include <assert.h>
+#include <memory>
+#include <malloc.h>
+
+//---------------------------------------------------------------------------------
+struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } };
+
+typedef std::unique_ptr<float, aligned_deleter> ScopedAlignedArrayFloat;
+
+#ifdef USE_XNAMATH
+typedef std::unique_ptr<XMVECTOR, aligned_deleter> ScopedAlignedArrayXMVECTOR;
+#else
+typedef std::unique_ptr<DirectX::XMVECTOR, aligned_deleter> ScopedAlignedArrayXMVECTOR;
+#endif
+
+//---------------------------------------------------------------------------------
+struct handle_closer { void operator()(HANDLE h) { assert(h != INVALID_HANDLE_VALUE); if (h) CloseHandle(h); } };
+
+typedef std::unique_ptr<void, handle_closer> ScopedHandle;
+
+inline HANDLE safe_handle( HANDLE h ) { return (h == INVALID_HANDLE_VALUE) ? 0 : h; }
+
+
+//---------------------------------------------------------------------------------
+template<class T> class ScopedObject
+{
+public:
+ explicit ScopedObject( T *p = 0 ) : _pointer(p) {}
+ ~ScopedObject()
+ {
+ if ( _pointer )
+ {
+ _pointer->Release();
+ _pointer = nullptr;
+ }
+ }
+
+ bool IsNull() const { return (!_pointer); }
+
+ T& operator*() { return *_pointer; }
+ T* operator->() { return _pointer; }
+ T** operator&() { return &_pointer; }
+
+ void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; }
+
+ T* Get() const { return _pointer; }
+
+private:
+ ScopedObject(const ScopedObject&);
+ ScopedObject& operator=(const ScopedObject&);
+
+ T* _pointer;
+};
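
ScopedObject is a minimal intrusive smart pointer for Release()-style COM interfaces; operator& deliberately exposes the address of the raw pointer so the object can be handed straight to factory methods. A sketch of the pattern used throughout the WIC code above:

    ScopedObject<IWICStream> stream;
    HRESULT hr = pWIC->CreateStream( &stream );      // operator& yields IWICStream**
    if ( SUCCEEDED(hr) )
        hr = stream->InitializeFromFilename( L"file.png", GENERIC_READ );
    // stream->Release() runs automatically when 'stream' leaves scope.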
diff --git a/thirdparty/directxtex/Microsoft Public License.rtf b/thirdparty/directxtex/Microsoft Public License.rtf
new file mode 100644
index 00000000..390c7adb
--- /dev/null
+++ b/thirdparty/directxtex/Microsoft Public License.rtf
@@ -0,0 +1,234 @@
+{\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe1033\themelang1033\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
+{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;}{\f36\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}{\f38\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604030504040204}Tahoma;}
+{\f39\fbidi \fswiss\fcharset0\fprq2{\*\panose 00000000000000000000}Verdana;}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
+{\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}
+{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
+{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;}
+{\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f40\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f41\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\f43\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f44\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f45\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f46\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\f47\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f48\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f50\fbidi \fswiss\fcharset238\fprq2 Arial CE;}{\f51\fbidi \fswiss\fcharset204\fprq2 Arial Cyr;}
+{\f53\fbidi \fswiss\fcharset161\fprq2 Arial Greek;}{\f54\fbidi \fswiss\fcharset162\fprq2 Arial Tur;}{\f55\fbidi \fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f56\fbidi \fswiss\fcharset178\fprq2 Arial (Arabic);}
+{\f57\fbidi \fswiss\fcharset186\fprq2 Arial Baltic;}{\f58\fbidi \fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f380\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f381\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;}
+{\f383\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f384\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f387\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f388\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);}
+{\f400\fbidi \froman\fcharset238\fprq2 Cambria CE;}{\f401\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\f403\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\f404\fbidi \froman\fcharset162\fprq2 Cambria Tur;}
+{\f407\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\f408\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\f420\fbidi \fswiss\fcharset238\fprq2 Tahoma CE;}{\f421\fbidi \fswiss\fcharset204\fprq2 Tahoma Cyr;}
+{\f423\fbidi \fswiss\fcharset161\fprq2 Tahoma Greek;}{\f424\fbidi \fswiss\fcharset162\fprq2 Tahoma Tur;}{\f425\fbidi \fswiss\fcharset177\fprq2 Tahoma (Hebrew);}{\f426\fbidi \fswiss\fcharset178\fprq2 Tahoma (Arabic);}
+{\f427\fbidi \fswiss\fcharset186\fprq2 Tahoma Baltic;}{\f428\fbidi \fswiss\fcharset163\fprq2 Tahoma (Vietnamese);}{\f429\fbidi \fswiss\fcharset222\fprq2 Tahoma (Thai);}{\f430\fbidi \fswiss\fcharset238\fprq2 Verdana CE;}
+{\f431\fbidi \fswiss\fcharset204\fprq2 Verdana Cyr;}{\f433\fbidi \fswiss\fcharset161\fprq2 Verdana Greek;}{\f434\fbidi \fswiss\fcharset162\fprq2 Verdana Tur;}{\f437\fbidi \fswiss\fcharset186\fprq2 Verdana Baltic;}
+{\f438\fbidi \fswiss\fcharset163\fprq2 Verdana (Vietnamese);}{\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
+{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
+{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \froman\fcharset238\fprq2 Cambria CE;}
+{\fhimajor\f31529\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\fhimajor\f31531\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\fhimajor\f31532\fbidi \froman\fcharset162\fprq2 Cambria Tur;}
+{\fhimajor\f31535\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\fhimajor\f31536\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
+{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
+{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
+{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
+{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
+{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbminor\f31563\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Calibri CE;}
+{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Calibri Cyr;}{\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Calibri Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Calibri Tur;}
+{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Calibri Baltic;}{\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Calibri (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
+{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
+{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
+{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;
+\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\*\defchp }{\*\defpap
+\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0
+\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0
+\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat \spriority9 heading 1;}{\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0
+\f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink16 \sqformat \spriority9 heading 2;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
+\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
+\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \snext11 \ssemihidden \sunhideused Normal Table;}{\*\cs15 \additive
+\rtlch\fcs1 \ab\af0\afs32 \ltrch\fcs0 \b\f36\fs32\kerning32 \sbasedon10 \slink1 \slocked \spriority9 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \ab\ai\af0\afs28 \ltrch\fcs0 \b\i\f36\fs28 \sbasedon10 \slink2 \slocked \spriority9 Heading 2 Char;}{
+\s17\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af38\afs16\alang1025 \ltrch\fcs0 \f38\fs16\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext17 \slink18 \ssemihidden \sunhideused \styrsid7424395 Balloon Text;}
+{\*\cs18 \additive \rtlch\fcs1 \af38\afs16 \ltrch\fcs0 \f38\fs16 \sbasedon10 \slink17 \slocked \ssemihidden \styrsid7424395 Balloon Text Char;}{\*\cs19 \additive \rtlch\fcs1 \af0\afs16 \ltrch\fcs0 \fs16
+\sbasedon10 \ssemihidden \sunhideused \styrsid4538388 annotation reference;}{\s20\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs20\alang1025 \ltrch\fcs0 \f1\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033
+\sbasedon0 \snext20 \slink21 \ssemihidden \sunhideused \styrsid4538388 annotation text;}{\*\cs21 \additive \rtlch\fcs1 \af1 \ltrch\fcs0 \f1 \sbasedon10 \slink20 \slocked \ssemihidden \styrsid4538388 Comment Text Char;}{
+\s22\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \ab\af1\afs20\alang1025 \ltrch\fcs0 \b\f1\fs20\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 \sbasedon20 \snext20 \slink23 \ssemihidden \sunhideused \styrsid4538388
+annotation subject;}{\*\cs23 \additive \rtlch\fcs1 \ab\af1 \ltrch\fcs0 \b\f1 \sbasedon21 \slink22 \slocked \ssemihidden \styrsid4538388 Comment Subject Char;}}{\*\rsidtbl \rsid213160\rsid284417\rsid417145\rsid481196\rsid551334\rsid723397\rsid786968
+\rsid1382437\rsid1390003\rsid1521043\rsid1530955\rsid1708989\rsid1783212\rsid1903779\rsid2431884\rsid3165084\rsid3416120\rsid3419781\rsid3754103\rsid3768194\rsid3831520\rsid4538130\rsid4538388\rsid4552277\rsid4680449\rsid4729674\rsid4865270\rsid4987534
+\rsid5128131\rsid5186068\rsid5601121\rsid5864350\rsid6186044\rsid6311778\rsid6384507\rsid6434687\rsid6561471\rsid6910344\rsid6947552\rsid7033180\rsid7424395\rsid7682010\rsid7690850\rsid7744081\rsid8151618\rsid8196281\rsid8198206\rsid8342723\rsid8350925
+\rsid8722561\rsid8852349\rsid8934457\rsid8944153\rsid9573035\rsid9635349\rsid9638545\rsid9724918\rsid10044820\rsid10095979\rsid10228618\rsid10449644\rsid10494075\rsid11166278\rsid11166751\rsid11285353\rsid11366513\rsid11494815\rsid11932529\rsid12061202
+\rsid12533699\rsid12536400\rsid12916885\rsid13264736\rsid13322831\rsid13440556\rsid13455614\rsid13597357\rsid13768671\rsid14097590\rsid14157399\rsid14229900\rsid14305025\rsid14314735\rsid14436896\rsid14565916\rsid14572556\rsid14688892\rsid14752433
+\rsid14904394\rsid15086147\rsid15749945\rsid15814398\rsid15927751\rsid16071312\rsid16126175\rsid16279402\rsid16391569\rsid16404661\rsid16452939\rsid16537688\rsid16606866\rsid16674896}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1
+\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\title Microsoft Permissive License (Ms-PL)}{\author Jonr}{\operator Chuck Walbourn}{\creatim\yr2007\mo2\dy23\hr15\min10}{\revtim\yr2011\mo8\dy15\hr15\min2}
+{\printim\yr2006\mo9\dy28\hr8\min46}{\version3}{\edmins1}{\nofpages1}{\nofwords391}{\nofchars2230}{\*\company Microsoft}{\nofcharsws2616}{\vern49273}}{\*\userprops {\propname _NewReviewCycle}\proptype30{\staticval }}{\*\xmlnstbl {\xmlns1 http://schemas.mi
+crosoft.com/office/word/2003/wordml}{\xmlns2 urn:schemas-microsoft-com:office:smarttags}}\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0\ltrsect
+\widowctrl\ftnbj\aenddoc\trackmoves0\trackformatting1\donotembedsysfont0\relyonvml0\donotembedlingdata1\grfdocevents0\validatexml0\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors0\hyphcaps0\horzdoc\dghspace120\dgvspace120
+\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3\jcompress\viewkind1\viewscale100\splytwnine\ftnlytwnine\htmautsp\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\rsidroot10494075
+\newtblstyruls\nogrowautofit\utinl \fet0{\*\wgrffmtfilter 2450}\ilfomacatclnup0\ltrpar \sectd \ltrsect\linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}
+{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}
+{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\pard\plain \ltrpar
+\s1\ql \li0\ri0\sb180\nowidctlpar\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af1\afs31 \ltrch\fcs0
+\fs31\cf1\kerning36\insrsid10494075\charrsid14688892 Microsoft}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid10494075 }{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid5601121 Public}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0
+\fs31\cf1\kerning36\insrsid14688892 }{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid10494075 License (Ms-PL}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0 \fs31\cf1\kerning36\insrsid4552277 )}{\rtlch\fcs1 \af1\afs31 \ltrch\fcs0
+\fs31\cf1\kerning36\insrsid10494075
+\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs17 \ltrch\fcs0
+\b\f39\fs17\insrsid10494075
+\par This license governs use of the accompanying software. If you use the software, you accept this license. If you do not accept the license, do not use the software.
+\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6910344
+\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0
+\b\f39\fs23\insrsid10494075 1. Definitions
+\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 The terms \'93reproduce,\'94 \'93reproduction}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7744081 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 \'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid551334 \'93derivative works,\'94}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7744081\charrsid7744081 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 and \'93distribution\'94 have the same meaning here as under
+{\*\xmlopen\xmlns2{\factoidname place}}{\*\xmlopen\xmlns2{\factoidname country-region}}U.S.{\*\xmlclose}{\*\xmlclose} copyright law.
+\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 A \'93contribution\'94 is the original software}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid4865270 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 }{\rtlch\fcs1
+\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid11932529 or}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 any additions or changes to the software.
+\par }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334 A \'93c}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334\charrsid551334 ontributor\'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 is}{\rtlch\fcs1 \af39\afs17
+\ltrch\fcs0 \f39\fs17\insrsid12536400\charrsid551334 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid551334\charrsid551334 any person that }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400
+distributes its contribution under this license.}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+\par }\pard \ltrpar\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0\pararsid14229900 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid4729674\delrsid4729674 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 \'93Licensed patents
+\'94 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 are }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 a contributor\rquote s }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 patent claims }{\rtlch\fcs1
+\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 that }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 read directly on }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3831520 its contribution.}{\rtlch\fcs1 \af1 \ltrch\fcs0
+\insrsid14229900\charrsid14229900
+\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0
+\b\f39\fs23\insrsid5186068
+\par }{\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 \b\f39\fs23\insrsid10494075 2. Grant of Rights
+\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 (A) Copyright Grant- Subject to the terms of this license, including the license conditions and limitations in section 3, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 each contributor }{\rtlch\fcs1 \af39\afs17
+\ltrch\fcs0 \f39\fs17\insrsid10494075 grants you a non-exclusive, worldwide, royalty-free copyright license to reproduce }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 , prepare derivative works of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid12536400 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 and distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3754103 its contribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 or any derivative works that you create.
+\par (B) Patent Grant- Subject to the terms of this license, including the license conditions and limitations in section 3, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid9724918 each contributor }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 grants you a non-exclusive, worldwide, royalty-free license under }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15814398 its }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+licensed patents to make, have made, use, sell, offer for sale, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid1390003 import, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 and/or otherwise dispose of }{\rtlch\fcs1 \af39\afs17
+\ltrch\fcs0 \f39\fs17\insrsid8944153 its contribution in }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software or derivative works of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid8944153 the contribution in }{\rtlch\fcs1
+\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software.
+\par }\pard\plain \ltrpar\s2\ql \li0\ri0\nowidctlpar\wrapdefault\faauto\outlinelevel1\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0
+\b\f39\fs23\insrsid5186068
+\par }{\rtlch\fcs1 \ab\af39\afs23 \ltrch\fcs0 \b\f39\fs23\insrsid10494075 3. Conditions and Limitations
+\par }\pard\plain \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0 \rtlch\fcs1 \af1\afs24\alang1025 \ltrch\fcs0 \f1\fs24\lang1033\langfe1033\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid1530955 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 (A) No Trademark License- This license does not grant you rights to use }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid1708989 any contributors\rquote }{
+\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 name, logo, or trademarks.
+\par (B) If you }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid8934457 bring a patent claim against }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10095979 any contributor}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+ over patents that you }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6947552 claim }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 are }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6947552 infringe}{\rtlch\fcs1
+\af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 d by}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software, your }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 patent }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075 license}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid7682010 from such contributor}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 to the software ends automatically.
+\par (C) If you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid3165084 any portion of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+the software, you must retain all copyright, patent, trademark, and attribution notices that are present in the software.
+\par (D) If you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15749945 any portion of the }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 software in source code form}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid14904394 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 you may do so only under this license}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid14904394 by including }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 a complete copy of this license with your distribution}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 .}{\rtlch\fcs1 \af39\afs17
+\ltrch\fcs0 \f39\fs17\insrsid10494075 }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid6384507 I}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 f you distribute }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid15749945
+any portion of }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 the software in compiled or object code form}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid16452939 ,}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+ you may only do so under a license that complies with this license.
+\par }\pard \ltrpar\ql \li0\ri0\sl336\slmult1\nowidctlpar\wrapdefault\faauto\rin0\lin0\itap0\pararsid14572556 {\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 (E) The software is licensed \'93as-is.\'94 You bear the risk of using it. }{
+\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid284417 The contributors }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075
+give no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws which this license cannot change. To the extent permitted under your local laws, }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid1783212 the contributors }{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0 \f39\fs17\insrsid10494075 exclude the implied warranties of merchantability, fitness for a particular purpose and non-infringement.}{\rtlch\fcs1 \af39\afs17 \ltrch\fcs0
+\f39\fs17\insrsid10494075\charrsid14572556
+\par }{\*\themedata 504b030414000600080000002100e9de0fbfff0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb4ec3301045f748fc83e52d4a
+9cb2400825e982c78ec7a27cc0c8992416c9d8b2a755fbf74cd25442a820166c2cd933f79e3be372bd1f07b5c3989ca74aaff2422b24eb1b475da5df374fd9ad
+5689811a183c61a50f98f4babebc2837878049899a52a57be670674cb23d8e90721f90a4d2fa3802cb35762680fd800ecd7551dc18eb899138e3c943d7e503b6
+b01d583deee5f99824e290b4ba3f364eac4a430883b3c092d4eca8f946c916422ecab927f52ea42b89a1cd59c254f919b0e85e6535d135a8de20f20b8c12c3b0
+0c895fcf6720192de6bf3b9e89ecdbd6596cbcdd8eb28e7c365ecc4ec1ff1460f53fe813d3cc7f5b7f020000ffff0300504b030414000600080000002100a5d6
+a7e7c0000000360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4f
+c7060abb0884a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b6309512
+0f88d94fbc52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462
+a1a82fe353bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f746865
+6d652f7468656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b
+4b0d592c9c070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b
+4757e8d3f729e245eb2b260a0238fd010000ffff0300504b03041400060008000000210096b5ade296060000501b0000160000007468656d652f7468656d652f
+7468656d65312e786d6cec594f6fdb3614bf0fd87720746f6327761a07758ad8b19b2d4d1bc46e871e698996d850a240d2497d1bdae38001c3ba618715d86d87
+615b8116d8a5fb34d93a6c1dd0afb0475292c5585e9236d88aad3e2412f9e3fbff1e1fa9abd7eec70c1d1221294fda5efd72cd4324f1794093b0eddd1ef62fad
+79482a9c0498f184b4bd2991deb58df7dfbb8ad755446282607d22d771db8b944ad79796a40fc3585ee62949606ecc458c15bc8a702910f808e8c66c69b9565b
+5d8a314d3c94e018c8de1a8fa94fd05093f43672e23d06af89927ac06762a049136785c10607758d9053d965021d62d6f6804fc08f86e4bef210c352c144dbab
+999fb7b4717509af678b985ab0b6b4ae6f7ed9ba6c4170b06c788a705430adf71bad2b5b057d03606a1ed7ebf5babd7a41cf00b0ef83a6569632cd467faddec9
+699640f6719e76b7d6ac355c7c89feca9cccad4ea7d36c65b258a206641f1b73f8b5da6a6373d9c11b90c537e7f08dce66b7bbeae00dc8e257e7f0fd2badd586
+8b37a088d1e4600ead1ddaef67d40bc898b3ed4af81ac0d76a197c86826828a24bb318f3442d8ab518dfe3a20f000d6458d104a9694ac6d88728eee2782428d6
+0cf03ac1a5193be4cbb921cd0b495fd054b5bd0f530c1931a3f7eaf9f7af9e3f45c70f9e1d3ff8e9f8e1c3e3073f5a42ceaa6d9c84e5552fbffdeccfc71fa33f
+9e7ef3f2d117d57859c6fffac327bffcfc793510d26726ce8b2f9ffcf6ecc98baf3efdfdbb4715f04d814765f890c644a29be408edf3181433567125272371be
+15c308d3f28acd249438c19a4b05fd9e8a1cf4cd296699771c393ac4b5e01d01e5a30a787d72cf1178108989a2159c77a2d801ee72ce3a5c545a6147f32a9979
+3849c26ae66252c6ed637c58c5bb8b13c7bfbd490a75330f4b47f16e441c31f7184e140e494214d273fc80900aedee52ead87597fa824b3e56e82e451d4c2b4d
+32a423279a668bb6690c7e9956e90cfe766cb37b077538abd27a8b1cba48c80acc2a841f12e698f13a9e281c57911ce298950d7e03aba84ac8c154f8655c4f2a
+f074481847bd804859b5e696007d4b4edfc150b12addbecba6b18b148a1e54d1bc81392f23b7f84137c2715a851dd0242a633f900710a218ed715505dfe56e86
+e877f0034e16bafb0e258ebb4faf06b769e888340b103d3311da9750aa9d0a1cd3e4efca31a3508f6d0c5c5c398602f8e2ebc71591f5b616e24dd893aa3261fb
+44f95d843b5974bb5c04f4edafb95b7892ec1108f3f98de75dc97d5772bdff7cc95d94cf672db4b3da0a6557f70db629362d72bcb0431e53c6066acac80d699a
+6409fb44d08741bdce9c0e4971624a2378cceaba830b05366b90e0ea23aaa241845368b0eb9e2612ca8c742851ca251ceccc70256d8d87265dd96361531f186c
+3d9058edf2c00eafe8e1fc5c509031bb4d680e9f39a3154de0accc56ae644441edd76156d7429d995bdd88664a9dc3ad50197c38af1a0c16d684060441db0256
+5e85f3b9660d0713cc48a0ed6ef7dedc2dc60b17e92219e180643ed27acffba86e9c94c78ab90980d8a9f0913ee49d62b512b79626fb06dccee2a432bbc60276
+b9f7dec44b7904cfbca4f3f6443ab2a49c9c2c41476dafd55c6e7ac8c769db1bc399161ee314bc2e75cf8759081743be1236ec4f4d6693e5336fb672c5dc24a8
+c33585b5fb9cc24e1d4885545b58463634cc5416022cd19cacfccb4d30eb45296023fd35a458598360f8d7a4003bbaae25e331f155d9d9a5116d3bfb9a95523e
+51440ca2e0088dd844ec6370bf0e55d027a012ae264c45d02f708fa6ad6da6dce29c255df9f6cae0ec38666984b372ab5334cf640b37795cc860de4ae2816e95
+b21be5ceaf8a49f90b52a51cc6ff3355f47e0237052b81f6800fd7b802239daf6d8f0b1571a8426944fdbe80c6c1d40e8816b88b8569082ab84c36ff0539d4ff
+6dce591a26ade1c0a7f669880485fd484582903d284b26fa4e2156cff62e4b9265844c4495c495a9157b440e091bea1ab8aaf7760f4510eaa69a6465c0e04ec6
+9ffb9e65d028d44d4e39df9c1a52ecbd3607fee9cec7263328e5d661d3d0e4f62f44acd855ed7ab33cdf7bcb8ae889599bd5c8b3029895b6825696f6af29c239
+b75a5bb1e6345e6ee6c28117e73586c1a2214ae1be07e93fb0ff51e133fb65426fa843be0fb515c187064d0cc206a2fa926d3c902e907670048d931db4c1a449
+59d366ad93b65abe595f70a75bf03d616c2dd959fc7d4e6317cd99cbcec9c58b34766661c7d6766ca1a9c1b327531486c6f941c638c67cd22a7f75e2a37be0e8
+2db8df9f30254d30c1372581a1f51c983c80e4b71ccdd28dbf000000ffff0300504b0304140006000800000021000dd1909fb60000001b010000270000007468
+656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73848f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4
+350d363f2451eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16db8ec8e4052164e89d93b64b060828e6f37ed1567914b284d2624
+52282e3198720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017cc524bd62107bd5001996509affb3fd381a89672f1f165dfe5141
+73d9850528a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d0014000600080000002100e9de0fbfff0000001c020000130000000000000000
+0000000000000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600080000002100a5d6a7e7c0000000360100000b00000000000000
+000000000000300100005f72656c732f2e72656c73504b01022d00140006000800000021006b799616830000008a0000001c0000000000000000000000000019
+0200007468656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d001400060008000000210096b5ade296060000501b00001600000000
+000000000000000000d60200007468656d652f7468656d652f7468656d65312e786d6c504b01022d00140006000800000021000dd1909fb60000001b01000027
+00000000000000000000000000a00900007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d0100009b0a00000000}
+{\*\colorschememapping 3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d
+617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169
+6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363
+656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e}
+{\*\latentstyles\lsdstimax267\lsdlockeddef0\lsdsemihiddendef1\lsdunhideuseddef1\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3;
+\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8;
+\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 1;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 2;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 3;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 4;
+\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 5;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 6;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 7;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 8;\lsdqformat1 \lsdpriority39 \lsdlocked0 toc 9;
+\lsdqformat1 \lsdpriority35 \lsdlocked0 caption;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority22 \lsdlocked0 Strong;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdpriority59 \lsdlocked0 Table Grid;
+\lsdunhideused0 \lsdlocked0 Placeholder Text;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdunhideused0 \lsdlocked0 Revision;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdpriority37 \lsdlocked0 Bibliography;
+\lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;}}{\*\datastore 0105000002000000180000004d73786d6c322e534158584d4c5265616465722e362e3000000000000000000000060000
+d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffff0c6ad98892f1d411a65f0040963251e5000000000000000000000000808a
+33fc965bcc01feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000
+000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000
+0000000000000000000000000000000000000000000000000105000000000000}} \ No newline at end of file
diff --git a/thirdparty/directxtex/ReadMe.txt b/thirdparty/directxtex/ReadMe.txt
new file mode 100644
index 00000000..0423b920
--- /dev/null
+++ b/thirdparty/directxtex/ReadMe.txt
@@ -0,0 +1,192 @@
+DIRECTX TEXTURE LIBRARY (DirectXTex)
+------------------------------------
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+November 15, 2012
+
+This package contains DirectXTex, a shared source library for reading and writing DDS
+files, and performing various texture content processing operations including
+resizing, format conversion, mip-map generation, block compression for Direct3D runtime
+texture resources, and height-map to normal-map conversion. This library makes
+use of the Windows Imaging Component (WIC) APIs. It also includes a simple .TGA reader and
+writer since this image file format is commonly used for texture content processing pipelines,
+but is not currently supported by a built-in WIC codec.
+
+The source is written for Visual C++ 2010 using the Direct3D headers from either
+a current DirectX SDK or Windows SDK. It can also be compiled using Visual Studio 2012 and the
+Windows SDK 8.0 headers.
+
+It is recommended that you make use of Visual C++ 2010 Service Pack 1 or VS 2012, and
+Windows 7 Service Pack 1 or Windows 8.
+
+DDSTextureLoader\
+ This contains a streamlined version of the DirectX SDK sample DDSWithoutD3DX11 texture
+ loading code for a simple light-weight runtime DDS loader. This version only supports
+ Direct3D 11 and performs no runtime pixel data conversions (i.e. 24bpp legacy DDS files
+ always fail). This is ideal for runtime usage, and supports the full complement of
+ Direct3D 11 texture resources (1D, 2D, volume maps, cubemaps, mipmap levels,
+ texture arrays, BC formats, etc.).
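
  A minimal usage sketch (illustrative only; it assumes the standalone header is named
  DDSTextureLoader.h and declares DirectX::CreateDDSTextureFromFile as described above,
  and the file name is hypothetical):

    #include <d3d11.h>
    #include <wrl/client.h>
    #include "DDSTextureLoader.h"   // assumed header name for the standalone loader

    HRESULT LoadSkybox(ID3D11Device* device,
                       Microsoft::WRL::ComPtr<ID3D11ShaderResourceView>& srv)
    {
        // No runtime pixel conversion is performed, so the DDS must already use a
        // DXGI format the device supports (e.g. BC1-BC7; 24bpp legacy files fail).
        return DirectX::CreateDDSTextureFromFile(device, L"skybox.dds",
                                                 nullptr,              // optional ID3D11Resource**
                                                 srv.GetAddressOf());  // shader resource view out
    }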
+
+WICTextureLoader\
+ This contains a Direct3D 11 2D texture loader that uses WIC to load a bitmap
+ (BMP, JPEG, PNG, HD Photo, or other WIC supported file container), resize if needed
+ based on the current feature level (or by explicit parameter), format convert to a
+ DXGI_FORMAT if required, and then create a 2D texture. Furthermore, if a Direct3D 11
+ device context is provided and the current device supports it for the given pixel format,
+ it will auto-generate mipmaps. Note this does not support 1D textures, volume textures,
+ cubemaps, or texture arrays. DDSTextureLoader is recommended for fully "precooked" textures
+ for maximum performance and image quality, but this loader can be useful for creating
+ simple 2D textures from standard image files at runtime.
+
+ Note: This function is not thread-safe if given a non-NULL device context for the auto-gen
+ mip-map support.
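
  A usage sketch for loading a standard image file (illustrative only; it assumes the
  standalone header is named WICTextureLoader.h and declares DirectX::CreateWICTextureFromFile
  as described above, and the file name is hypothetical):

    #include <d3d11.h>
    #include "WICTextureLoader.h"   // assumed header name for the standalone loader

    HRESULT LoadAlbedo(ID3D11Device* device, ID3D11DeviceContext* context,
                       ID3D11ShaderResourceView** srv)
    {
        // A non-NULL device context requests auto-generated mipmaps when the device
        // supports it for the resulting format; see the thread-safety note above.
        return DirectX::CreateWICTextureFromFile(device, context, L"albedo.png",
                                                 nullptr,   // optional ID3D11Resource**
                                                 srv);
    }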
+
+DirectXTex\
+ This contains the DirectXTex library. This includes a full-featured DDS reader and writer
+ including legacy format conversions, a TGA reader and writer, a WIC-based bitmap reader and
+ writer (BMP, JPEG, PNG, TIFF, and HD Photo), and various texture processing functions. This
+ is intended primarily for tool usage.
+
+ Note that the majority of the header files here are intended for internal implementation
+ of the library only (BC.h, DDS.h, DirectXTexP.h, and scoped.h). Only DirectXTex.h is
+ meant as a 'public' header for the library.
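
  A tool-side sketch of the library API (illustrative only; function and flag names are
  taken from DirectXTex.h as of this release and should be checked against that header):
  load a DDS file, generate a full mip chain, and save the result back out.

    #include "DirectXTex.h"
    using namespace DirectX;

    HRESULT BuildMipChain(const wchar_t* inFile, const wchar_t* outFile)
    {
        TexMetadata  info;
        ScratchImage loaded, mips;

        HRESULT hr = LoadFromDDSFile(inFile, DDS_FLAGS_NONE, &info, loaded);
        if (FAILED(hr)) return hr;

        // Requesting 0 levels generates a full mip chain down to 1x1.
        hr = GenerateMipMaps(loaded.GetImages(), loaded.GetImageCount(), info,
                             TEX_FILTER_DEFAULT, 0, mips);
        if (FAILED(hr)) return hr;

        return SaveToDDSFile(mips.GetImages(), mips.GetImageCount(),
                             mips.GetMetadata(), DDS_FLAGS_NONE, outFile);
    }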
+
+Texconv\
+ This DirectXTex sample is an implementation of the "texconv" command-line texture utility
+ from the DirectX SDK utilizing DirectXTex rather than D3DX.
+
+ It supports the same arguments as the Texture Conversion Tool Extended (texconvex.exe) DirectX
+ SDK utility. See <http://msdn.microsoft.com/en-us/library/ee422506.aspx>. The primary differences
+ are that the -10 and -11 arguments are not applicable; that the filter names are POINT, LINEAR, CUBIC,
+ FANT, POINT_DITHER, LINEAR_DITHER, CUBIC_DITHER, and FANT_DITHER; and that the .TGA file format is supported.
+ This also includes support for JPEG XR/HD Photo bitmap formats (see
+ <http://blogs.msdn.com/b/chuckw/archive/2011/01/19/known-issue-texconvex.aspx>)
+
+DDSView\
+ This DirectXTex sample is a simple Direct3D 11-based viewer for DDS files. For array textures
+ or volume maps, the "<" and ">" keyboard keys will show different images contained in the DDS.
+ The "1" through "0" keys can also be used to jump to a specific image index.
+
+XNAMath\
+ This contains a copy of XNA Math version 2.05, which is an updated version of the library. This is
+ required if building content with USE_XNAMATH (the default for the VS 2010 projects). The VS 2012
+ projects use DirectXMath in the Windows SDK 8.0 instead.
+ For details see
+ <http://blogs.msdn.com/b/chuckw/archive/2012/06/22/xna-math-version-2-05-smoothing-the-transition-to-directxmath.aspx>
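
  A small sketch of how a translation unit might select between the bundled XNA Math
  headers and DirectXMath, keyed off the USE_XNAMATH define mentioned above (illustrative
  only; the library's own private headers may use a different guard):

    #if defined(USE_XNAMATH)
    #include <xnamath.h>        // bundled XNA Math 2.05 (VS 2010 default)
    #else
    #include <DirectXMath.h>    // Windows SDK 8.0 DirectXMath (VS 2012 projects)
    using namespace DirectX;    // DirectXMath types live in the DirectX namespace
    #endif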
+
+All content and source code for this package except XNA Math are bound to the Microsoft Public License (Ms-PL)
+<http://www.microsoft.com/en-us/openness/licenses.aspx#MPL>. The XNA Math library is subject
+to the DirectX SDK (June 2010) End-User License Agreement.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+
+
+------------------------------------
+RELEASE NOTES
+
+* The DirectXTex library does not support block compression or decompression of mipmapped non-power-of-2 textures,
+ although DDSTextureLoader will load these files correctly if the underlying device supports it.
+
+* The DirectXTex library only supports CLAMP filtering, and does not yet support MIRROR or WRAP filtering
+ (WIC operations only support CLAMP filtering).
+
+* The DirectXTex library only supports box and POINT filtering, and does not support LINEAR or CUBIC filtering,
+ for 3D volume mipmap-generation.
+
+* Due to the underlying Windows BMP WIC codec, alpha channels are not supported for 16bpp or 32bpp BMP pixel format files. The Windows 8
+ version of the Windows BMP WIC codec does support 32bpp pixel formats with alpha when using the BITMAPV5HEADER file header. Note the updated
+ WIC is available on Windows 7 SP1 with KB 2670838 installed.
+
+* The WIC conversion cases currently ignore TEX_FILTER_SRGB_IN and TEX_FILTER_SRGB_OUT.
+
+* For the DXGI 1.1 version of DirectXTex, 4:4:4:4 pixel format DDS files are always expanded to 8:8:8:8 upon load since DXGI 1.0
+ and DXGI 1.1 versions of Direct3D do not support these resource formats. The DXGI 1.2 versions of DirectXTex and DDSTextureLoader
+ make use of the DXGI_FORMAT_B4G4R4A4_UNORM format instead.
+
+* While DXGI 1.0 and DXGI 1.1 include 5:6:5 (DXGI_FORMAT_B5G6R5_UNORM) and 5:5:5:1 (DXGI_FORMAT_B5G5R5A1_UNORM)
+ pixel format enumerations, the DirectX 10.x and 11.0 Runtimes do not support these formats for use with Direct3D. The DirectX 11.1 runtime,
+ DXGI 1.2, and the WDDM 1.2 driver model fully support 16bpp formats (5:6:5, 5:5:5:1, and 4:4:4:4). The DXGI 1.2 version of WICTextureLoader
+ will load 16bpp pixel images as 5:6:5 or 5:5:5:1 rather than expand them to 32bpp RGBA.
+
+* WICTextureLoader cannot load .TGA files unless the system has a 3rd party WIC codec installed. You must use the DirectXTex
+ library for TGA file format support without relying on an add-on WIC codec.
+
+* Loading of 96bpp floating-point TIFF files results in a corrupted image prior to Windows 8. A fix is available on Windows 7 SP1 with
+ KB 2670838 installed.
+
+
+------------------------------------
+RELEASE HISTORY
+
+November 15, 2012
+ Added support for WIC2 when available on Windows 8 and Windows 7 with KB 2670838
+ Added optional targetGUID parameter to SaveWIC* APIs to influence final container pixel format choice
+ Fixed bug in SaveDDS* which was generating invalid DDS files for 1D dimension textures
+ Improved robustness of CaptureTexture when resolving MSAA source textures
+ Sync'd DDSTextureLoader, ScreenGrab, and WICTextureLoader standalone versions with latest DirectXTK release
+
+September 28, 2012
+ Added ScreenGrab module for creating runtime screenshots
+ Renamed project files for better naming consistency
+ New Typeless utilities for DirectXTex
+ Some minor code cleanup for DirectXTex's WIC writer function
+ Bug fixes and new -tu/-tf options for texconv
+
+June 22, 2012
+ Moved to using XNA Math 2.05 instead of XNA Math 2.04 for USE_XNAMATH builds
+ Fixed BGR vs. RGB color channel swizzle problem with 24bpp legacy .DDS files in DirectXTex
+ Update to DirectXTex WIC and WICTextureLoader for additional 96bpp float format handling on Windows 8
+
+May 31, 2012
+ Minor fix for DDSTextureLoader's retry fallback that can happen with 10level9 feature levels
+ Switched to use "_DEBUG" instead of "DEBUG" and cleaned up debug warnings
+ Added Metro style application project files for DirectXTex
+
+April 20, 2012
+ DirectXTex's WIC-based writer opts in to the Windows 8 BMP encoder option for writing 32 bpp RGBA files with the BITMAPV5HEADER
+
+March 30, 2012
+ WICTextureLoader updated with Windows 8 WIC pixel formats
+ DirectXTex updated with limited non-power-of-2 texture support and TEX_FILTER_SEPARATE_ALPHA option
+ Texconv updated with '-sepalpha' command-line option
+ Added USE_XNAMATH control define to build DirectXTex using either XNAMath or DirectXMath
+ Added VS 2012 project files (which use DirectXMath instead of XNAMath and define DXGI_1_2_FORMATS)
+
+March 15, 2012
+ Fix for resource leak in CreateShaderResourceView() Direct3D 11 helper function in DirectXTex
+
+March 5, 2012
+ Fix for too much temp memory allocated by WICTextureLoader; cleaned up legacy 'min/max' macro usage in DirectXTex
+
+February 21, 2012
+ WICTextureLoader updated to handle systems and device drivers without BGRA or 16bpp format support
+
+February 20, 2012
+ Some code cleanup for DirectXTex and DDSTextureLoader
+ Fixed bug in 10:10:10:2 format fixup in the LoadDDSFromMemory function
+ Fixed bugs in "non-zero alpha" special-case handling in LoadTGAFromFile
+ Fixed bug in _SwizzleScanline when copying alpha channel for BGRA<->RGBA swizzling
+
+February 11, 2012
+ Update of DDSTextureLoader to also build in Metro style apps; added WICTextureLoader
+ Added CMYK WIC pixel formats to the DirectXTex conversion table
+
+January 30, 2012
+ Minor code-cleanup for DirectXTex to enable use of PCH through 'directxtexp.h' header
+
+January 24, 2012
+ Some code-cleanup for DirectXTex
+ Added DXGI 1.2 implementation for DDSTextureLoader and DirectXTex guarded with DXGI_1_2_FORMATS compilation define
+
+December 16, 2011
+ Fixed x64 compilation warnings in DDSTextureLoader
+
+November 30, 2011
+ Fixed some of the constants used in IsSupportedTexture(),
+ added ability to strip off top levels of mips in DDSTextureLoader,
+ changed DirectXTex to use CoCreateInstance rather than LoadLibrary to obtain the WIC factory,
+ a few minor /analyze related annotations for DirectXTex
+
+October 27, 2011
+ Original release \ No newline at end of file
diff --git a/thirdparty/directxtex/XNAMath/xnamath.h b/thirdparty/directxtex/XNAMath/xnamath.h
new file mode 100644
index 00000000..941af11a
--- /dev/null
+++ b/thirdparty/directxtex/XNAMath/xnamath.h
@@ -0,0 +1,3397 @@
+/************************************************************************
+* *
+* XNAMath.h -- SIMD C++ Math library for Windows and Xbox 360 *
+* *
+* Copyright (c) Microsoft Corp. All rights reserved. *
+* *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATH_H__
+#define __XNAMATH_H__
+
+#ifdef __XBOXMATH_H__
+#error XNAMATH and XBOXMATH are incompatible in the same compilation module. Use one or the other.
+#endif
+
+#define XNAMATH_VERSION 205
+
+#if !defined(_XM_X64_) && !defined(_XM_X86_)
+#if defined(_M_AMD64) || defined(_AMD64_)
+#define _XM_X64_
+#elif defined(_M_IX86) || defined(_X86_)
+#define _XM_X86_
+#endif
+#endif
+
+
+#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_)
+#if defined(_XM_X64_) || defined(_XM_X86_)
+#define _XM_LITTLEENDIAN_
+#elif defined(_XBOX_VER)
+#define _XM_BIGENDIAN_
+#else
+#error xnamath.h does not support this target
+#endif
+#endif
+
+#if defined(_XM_X86_) || defined(_XM_X64_)
+#define _XM_SSE_INTRINSICS_
+#if !defined(__cplusplus) && !defined(_XM_NO_INTRINSICS_)
+#error xnamath.h only supports C compilation for Xbox 360 targets and no intrinsics cases for x86/x64
+#endif
+#elif defined(_XBOX_VER)
+#if !defined(__VMX128_SUPPORTED) && !defined(_XM_NO_INTRINSICS_)
+#error xnamath.h requires VMX128 compiler support for XBOX 360
+#endif // !__VMX128_SUPPORTED && !_XM_NO_INTRINSICS_
+#define _XM_VMX128_INTRINSICS_
+#elif !defined(_XM_NO_INTRINSICS_)
+#error xnamath.h does not support this target
+#endif
+
+
+#if defined(_XM_SSE_INTRINSICS_)
+#ifndef _XM_NO_INTRINSICS_
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#endif
+#elif defined(_XM_VMX128_INTRINSICS_)
+#error This version of xnamath.h does not support Xbox 360
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_)
+#pragma warning(push)
+#pragma warning(disable:4985)
+#endif
+#include <math.h>
+#if defined(_XM_SSE_INTRINSICS_)
+#pragma warning(pop)
+#endif
+
+
+#include <sal.h>
+
+
+#if !defined(XMINLINE)
+#if !defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#define XMINLINE __inline
+#else
+#define XMINLINE __forceinline
+#endif
+#endif
+
+#if !defined(XMFINLINE)
+#define XMFINLINE __forceinline
+#endif
+
+#if !defined(XMDEBUG)
+#if defined(_DEBUG)
+#define XMDEBUG
+#endif
+#endif // !XMDEBUG
+
+#if !defined(XMASSERT)
+#if defined(_PREFAST_)
+#define XMASSERT(Expression) __analysis_assume((Expression))
+#elif defined(XMDEBUG) // !_PREFAST_
+#define XMASSERT(Expression) ((VOID)((Expression) || (XMAssert(#Expression, __FILE__, __LINE__), 0)))
+#else // !XMDEBUG
+#define XMASSERT(Expression) ((VOID)0)
+#endif // !XMDEBUG
+#endif // !XMASSERT
+
+#if !defined(XM_NO_ALIGNMENT)
+#define _DECLSPEC_ALIGN_16_ __declspec(align(16))
+#else
+#define _DECLSPEC_ALIGN_16_
+#endif
+
+
+#if defined(_MSC_VER) && (_MSC_VER<1500) && (_MSC_VER>=1400)
+#define _XM_ISVS2005_
+#endif
+
+/****************************************************************************
+ *
+ * Constant definitions
+ *
+ ****************************************************************************/
+
+#define XM_PI 3.141592654f
+#define XM_2PI 6.283185307f
+#define XM_1DIVPI 0.318309886f
+#define XM_1DIV2PI 0.159154943f
+#define XM_PIDIV2 1.570796327f
+#define XM_PIDIV4 0.785398163f
+
+#define XM_SELECT_0 0x00000000
+#define XM_SELECT_1 0xFFFFFFFF
+
+#define XM_PERMUTE_0X 0x00010203
+#define XM_PERMUTE_0Y 0x04050607
+#define XM_PERMUTE_0Z 0x08090A0B
+#define XM_PERMUTE_0W 0x0C0D0E0F
+#define XM_PERMUTE_1X 0x10111213
+#define XM_PERMUTE_1Y 0x14151617
+#define XM_PERMUTE_1Z 0x18191A1B
+#define XM_PERMUTE_1W 0x1C1D1E1F
+
+#define XM_CRMASK_CR6 0x000000F0
+#define XM_CRMASK_CR6TRUE 0x00000080
+#define XM_CRMASK_CR6FALSE 0x00000020
+#define XM_CRMASK_CR6BOUNDS XM_CRMASK_CR6FALSE
+
+
+#define XM_CACHE_LINE_SIZE 64
+
+/****************************************************************************
+ *
+ * Macros
+ *
+ ****************************************************************************/
+
+// Unit conversion
+
+XMFINLINE FLOAT XMConvertToRadians(FLOAT fDegrees) { return fDegrees * (XM_PI / 180.0f); }
+XMFINLINE FLOAT XMConvertToDegrees(FLOAT fRadians) { return fRadians * (180.0f / XM_PI); }
+
+// Condition register evaluation following a recording (Rc) comparison
+
+#define XMComparisonAllTrue(CR) (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE)
+#define XMComparisonAnyTrue(CR) (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE)
+#define XMComparisonAllFalse(CR) (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE)
+#define XMComparisonAnyFalse(CR) (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE)
+#define XMComparisonMixed(CR) (((CR) & XM_CRMASK_CR6) == 0)
+#define XMComparisonAllInBounds(CR) (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS)
+#define XMComparisonAnyOutOfBounds(CR) (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS)
+
+
+#define XMMin(a, b) (((a) < (b)) ? (a) : (b))
+#define XMMax(a, b) (((a) > (b)) ? (a) : (b))
+
+/****************************************************************************
+ *
+ * Data types
+ *
+ ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable:4201 4365 4324)
+
+#ifdef _XM_BIGENDIAN_
+#pragma bitfield_order(push)
+#pragma bitfield_order(lsb_to_msb)
+#endif
+
+//------------------------------------------------------------------------------
+#if defined(_XM_NO_INTRINSICS_) && !defined(_XBOX_VER)
+// The __vector4 structure is an intrinsic on Xbox but must be separately defined
+// for x86/x64
+typedef struct __vector4
+{
+ union
+ {
+ float vector4_f32[4];
+ unsigned int vector4_u32[4];
+#ifndef XM_STRICT_VECTOR4
+ struct
+ {
+ FLOAT x;
+ FLOAT y;
+ FLOAT z;
+ FLOAT w;
+ };
+ FLOAT v[4];
+ UINT u[4];
+#endif // !XM_STRICT_VECTOR4
+ };
+} __vector4;
+#endif // _XM_NO_INTRINSICS_
+
+//------------------------------------------------------------------------------
+#if (defined (_XM_X86_) || defined(_XM_X64_)) && defined(_XM_NO_INTRINSICS_)
+typedef UINT __vector4i[4];
+#else
+typedef __declspec(align(16)) UINT __vector4i[4];
+#endif
+
+//------------------------------------------------------------------------------
+// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
+// boundary and mapped to hardware vector registers
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+typedef __m128 XMVECTOR;
+#else
+typedef __vector4 XMVECTOR;
+#endif
+
+// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86 and Xbox 360, but not for other targets
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+typedef const XMVECTOR FXMVECTOR;
+#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINSICS_)
+typedef const XMVECTOR FXMVECTOR;
+#elif defined(__cplusplus)
+typedef const XMVECTOR& FXMVECTOR;
+#else
+typedef const XMVECTOR FXMVECTOR;
+#endif
+
+// Fix-up for (4th+) XMVECTOR parameters to pass in-register for Xbox 360 and by reference otherwise
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+typedef const XMVECTOR CXMVECTOR;
+#elif defined(__cplusplus)
+typedef const XMVECTOR& CXMVECTOR;
+#else
+typedef const XMVECTOR CXMVECTOR;
+#endif
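
// Illustrative note (not part of xnamath.h): following the convention above, the first
// three XMVECTOR parameters of a function are declared FXMVECTOR and any later ones
// CXMVECTOR, so they can pass in registers where the target calling convention allows.
// A hypothetical declaration would therefore look like:
//   XMVECTOR ExampleBlend(FXMVECTOR A, FXMVECTOR B, FXMVECTOR C, CXMVECTOR T);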
+
+//------------------------------------------------------------------------------
+// Conversion types for constants
+typedef _DECLSPEC_ALIGN_16_ struct XMVECTORF32 {
+ union {
+ float f[4];
+ XMVECTOR v;
+ };
+
+#if defined(__cplusplus)
+ inline operator XMVECTOR() const { return v; }
+ inline operator const float*() const { return f; }
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+ inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
+ inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
+#endif
+#endif // __cplusplus
+} XMVECTORF32;
+
+typedef _DECLSPEC_ALIGN_16_ struct XMVECTORI32 {
+ union {
+ INT i[4];
+ XMVECTOR v;
+ };
+#if defined(__cplusplus)
+ inline operator XMVECTOR() const { return v; }
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+ inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
+ inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
+#endif
+#endif // __cplusplus
+} XMVECTORI32;
+
+typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU8 {
+ union {
+ BYTE u[16];
+ XMVECTOR v;
+ };
+#if defined(__cplusplus)
+ inline operator XMVECTOR() const { return v; }
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+ inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
+ inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
+#endif
+#endif // __cplusplus
+} XMVECTORU8;
+
+typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU32 {
+ union {
+ UINT u[4];
+ XMVECTOR v;
+ };
+#if defined(__cplusplus)
+ inline operator XMVECTOR() const { return v; }
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+ inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
+ inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
+#endif
+#endif // __cplusplus
+} XMVECTORU32;
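These wrapper types exist so that 16-byte-aligned constants can be written as plain aggregate initializers and still be passed wherever an XMVECTOR is expected, via the conversion operators above. A minimal sketch (the constant name is an assumption for illustration):

    // Illustrative: a global vector constant built through XMVECTORF32.
    static const XMVECTORF32 g_UnitY = { 0.0f, 1.0f, 0.0f, 0.0f };
    XMVECTOR v = g_UnitY;   // implicit conversion to XMVECTOR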
+
+//------------------------------------------------------------------------------
+// Vector operators
+#if defined(__cplusplus) && !defined(XM_NO_OPERATOR_OVERLOADS)
+
+XMVECTOR operator+ (FXMVECTOR V);
+XMVECTOR operator- (FXMVECTOR V);
+
+XMVECTOR& operator+= (XMVECTOR& V1, FXMVECTOR V2);
+XMVECTOR& operator-= (XMVECTOR& V1, FXMVECTOR V2);
+XMVECTOR& operator*= (XMVECTOR& V1, FXMVECTOR V2);
+XMVECTOR& operator/= (XMVECTOR& V1, FXMVECTOR V2);
+XMVECTOR& operator*= (XMVECTOR& V, FLOAT S);
+XMVECTOR& operator/= (XMVECTOR& V, FLOAT S);
+
+XMVECTOR operator+ (FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR operator- (FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR operator* (FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR operator/ (FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR operator* (FXMVECTOR V, FLOAT S);
+XMVECTOR operator* (FLOAT S, FXMVECTOR V);
+XMVECTOR operator/ (FXMVECTOR V, FLOAT S);
+
+#endif // __cplusplus && !XM_NO_OPERATOR_OVERLOADS
+
+//------------------------------------------------------------------------------
+// Matrix type: Sixteen 32 bit floating point components aligned on a
+// 16 byte boundary and mapped to four hardware vector registers
+#if (defined(_XM_X86_) || defined(_XM_X64_)) && defined(_XM_NO_INTRINSICS_)
+typedef struct _XMMATRIX
+#else
+typedef _DECLSPEC_ALIGN_16_ struct _XMMATRIX
+#endif
+{
+#if defined(_XM_NO_INTRINSICS_) || !defined(XM_STRICT_MATRIX)
+ union
+ {
+ XMVECTOR r[4];
+ struct
+ {
+ FLOAT _11, _12, _13, _14;
+ FLOAT _21, _22, _23, _24;
+ FLOAT _31, _32, _33, _34;
+ FLOAT _41, _42, _43, _44;
+ };
+ FLOAT m[4][4];
+ };
+#else
+ XMVECTOR r[4];
+#endif
+
+#ifdef __cplusplus
+
+ _XMMATRIX() {};
+ _XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3);
+ _XMMATRIX(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33);
+ explicit _XMMATRIX(_In_count_c_(16) CONST FLOAT *pArray);
+
+#if defined(_XM_NO_INTRINSICS_) || !defined(XM_STRICT_MATRIX)
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+#endif
+
+ _XMMATRIX& operator= (CONST _XMMATRIX& M);
+
+#ifndef XM_NO_OPERATOR_OVERLOADS
+ _XMMATRIX& operator*= (CONST _XMMATRIX& M);
+ _XMMATRIX operator* (CONST _XMMATRIX& M) CONST;
+#endif // !XM_NO_OPERATOR_OVERLOADS
+
+#endif // __cplusplus
+
+} XMMATRIX;
+
+// Fix-up for XMMATRIX parameters to pass in-register on Xbox 360, by reference otherwise
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+typedef const XMMATRIX CXMMATRIX;
+#elif defined(__cplusplus)
+typedef const XMMATRIX& CXMMATRIX;
+#else
+typedef const XMMATRIX CXMMATRIX;
+#endif
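As with CXMVECTOR, declaring matrix parameters as CXMMATRIX keeps a single prototype working for both the register-passing and by-reference cases. A sketch (the function name is illustrative):

    // Illustrative prototype only.
    XMVECTOR TransformPoint(FXMVECTOR P, CXMMATRIX M);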
+
+//------------------------------------------------------------------------------
+// 16 bit floating point number consisting of a sign bit, a 5 bit biased
+// exponent, and a 10 bit mantissa
+typedef USHORT HALF;
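HALF is purely a storage type; the library converts to and from 32-bit floats with XMConvertFloatToHalf and XMConvertHalfToFloat, declared later in this header. A small round-trip sketch:

    HALF  h = XMConvertFloatToHalf(1.5f);   // 1.5f is exactly representable in half precision
    FLOAT f = XMConvertHalfToFloat(h);      // f == 1.5f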
+
+//------------------------------------------------------------------------------
+// 2D Vector; 32 bit floating point components
+typedef struct _XMFLOAT2
+{
+ FLOAT x;
+ FLOAT y;
+
+#ifdef __cplusplus
+
+ _XMFLOAT2() {};
+ _XMFLOAT2(FLOAT _x, FLOAT _y) : x(_x), y(_y) {};
+ _XMFLOAT2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMFLOAT2& operator= (CONST _XMFLOAT2& Float2);
+
+#endif // __cplusplus
+
+} XMFLOAT2;
+
+// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary
+#ifdef __cplusplus
+__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2
+{
+ XMFLOAT2A() : XMFLOAT2() {};
+ XMFLOAT2A(FLOAT _x, FLOAT _y) : XMFLOAT2(_x, _y) {};
+ XMFLOAT2A(_In_count_c_(2) CONST FLOAT *pArray) : XMFLOAT2(pArray) {};
+
+ XMFLOAT2A& operator= (CONST XMFLOAT2A& Float2);
+};
+#else
+typedef __declspec(align(16)) XMFLOAT2 XMFLOAT2A;
+#endif // __cplusplus
+
+//------------------------------------------------------------------------------
+// 2D Vector; 32 bit signed integer components
+typedef struct _XMINT2
+{
+ INT x;
+ INT y;
+
+#ifdef __cplusplus
+
+ _XMINT2() {};
+ _XMINT2(INT _x, INT _y) : x(_x), y(_y) {};
+ explicit _XMINT2(_In_count_c_(2) CONST INT *pArray);
+
+ _XMINT2& operator= (CONST _XMINT2& Int2);
+
+#endif // __cplusplus
+
+} XMINT2;
+
+// 2D Vector; 32 bit unsigned integer components
+typedef struct _XMUINT2
+{
+ UINT x;
+ UINT y;
+
+#ifdef __cplusplus
+
+ _XMUINT2() {};
+ _XMUINT2(UINT _x, UINT _y) : x(_x), y(_y) {};
+ explicit _XMUINT2(_In_count_c_(2) CONST UINT *pArray);
+
+ _XMUINT2& operator= (CONST _XMUINT2& UInt2);
+
+#endif // __cplusplus
+
+} XMUINT2;
+
+//------------------------------------------------------------------------------
+// 2D Vector; 16 bit floating point components
+typedef struct _XMHALF2
+{
+ HALF x;
+ HALF y;
+
+#ifdef __cplusplus
+
+ _XMHALF2() {};
+ _XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {};
+ explicit _XMHALF2(_In_count_c_(2) CONST HALF *pArray);
+ _XMHALF2(FLOAT _x, FLOAT _y);
+ explicit _XMHALF2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMHALF2& operator= (CONST _XMHALF2& Half2);
+
+#endif // __cplusplus
+
+} XMHALF2;
+
+//------------------------------------------------------------------------------
+// 2D Vector; 16 bit signed normalized integer components
+typedef struct _XMSHORTN2
+{
+ SHORT x;
+ SHORT y;
+
+#ifdef __cplusplus
+
+ _XMSHORTN2() {};
+ _XMSHORTN2(SHORT _x, SHORT _y) : x(_x), y(_y) {};
+ explicit _XMSHORTN2(_In_count_c_(2) CONST SHORT *pArray);
+ _XMSHORTN2(FLOAT _x, FLOAT _y);
+ explicit _XMSHORTN2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMSHORTN2& operator= (CONST _XMSHORTN2& ShortN2);
+
+#endif // __cplusplus
+
+} XMSHORTN2;
+
+// 2D Vector; 16 bit signed integer components
+typedef struct _XMSHORT2
+{
+ SHORT x;
+ SHORT y;
+
+#ifdef __cplusplus
+
+ _XMSHORT2() {};
+ _XMSHORT2(SHORT _x, SHORT _y) : x(_x), y(_y) {};
+ explicit _XMSHORT2(_In_count_c_(2) CONST SHORT *pArray);
+ _XMSHORT2(FLOAT _x, FLOAT _y);
+ explicit _XMSHORT2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMSHORT2& operator= (CONST _XMSHORT2& Short2);
+
+#endif // __cplusplus
+
+} XMSHORT2;
+
+// 2D Vector; 16 bit unsigned normalized integer components
+typedef struct _XMUSHORTN2
+{
+ USHORT x;
+ USHORT y;
+
+#ifdef __cplusplus
+
+ _XMUSHORTN2() {};
+ _XMUSHORTN2(USHORT _x, USHORT _y) : x(_x), y(_y) {};
+ explicit _XMUSHORTN2(_In_count_c_(2) CONST USHORT *pArray);
+ _XMUSHORTN2(FLOAT _x, FLOAT _y);
+ explicit _XMUSHORTN2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMUSHORTN2& operator= (CONST _XMUSHORTN2& UShortN2);
+
+#endif // __cplusplus
+
+} XMUSHORTN2;
+
+// 2D Vector; 16 bit unsigned integer components
+typedef struct _XMUSHORT2
+{
+ USHORT x;
+ USHORT y;
+
+#ifdef __cplusplus
+
+ _XMUSHORT2() {};
+ _XMUSHORT2(USHORT _x, USHORT _y) : x(_x), y(_y) {};
+ explicit _XMUSHORT2(_In_count_c_(2) CONST USHORT *pArray);
+ _XMUSHORT2(FLOAT _x, FLOAT _y);
+ explicit _XMUSHORT2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMUSHORT2& operator= (CONST _XMUSHORT2& UShort2);
+
+#endif // __cplusplus
+
+} XMUSHORT2;
+
+//------------------------------------------------------------------------------
+// 2D Vector; 8 bit signed normalized integer components
+typedef struct _XMBYTEN2
+{
+ CHAR x;
+ CHAR y;
+
+#ifdef __cplusplus
+
+ _XMBYTEN2() {};
+ _XMBYTEN2(CHAR _x, CHAR _y) : x(_x), y(_y) {};
+ explicit _XMBYTEN2(_In_count_c_(2) CONST CHAR *pArray);
+ _XMBYTEN2(FLOAT _x, FLOAT _y);
+ explicit _XMBYTEN2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMBYTEN2& operator= (CONST _XMBYTEN2& ByteN2);
+
+#endif // __cplusplus
+
+} XMBYTEN2;
+
+// 2D Vector; 8 bit signed integer components
+typedef struct _XMBYTE2
+{
+ CHAR x;
+ CHAR y;
+
+#ifdef __cplusplus
+
+ _XMBYTE2() {};
+ _XMBYTE2(CHAR _x, CHAR _y) : x(_x), y(_y) {};
+ explicit _XMBYTE2(_In_count_c_(2) CONST CHAR *pArray);
+ _XMBYTE2(FLOAT _x, FLOAT _y);
+ explicit _XMBYTE2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMBYTE2& operator= (CONST _XMBYTE2& Byte2);
+
+#endif // __cplusplus
+
+} XMBYTE2;
+
+// 2D Vector; 8 bit unsigned normalized integer components
+typedef struct _XMUBYTEN2
+{
+ BYTE x;
+ BYTE y;
+
+#ifdef __cplusplus
+
+ _XMUBYTEN2() {};
+ _XMUBYTEN2(BYTE _x, BYTE _y) : x(_x), y(_y) {};
+ explicit _XMUBYTEN2(_In_count_c_(2) CONST BYTE *pArray);
+ _XMUBYTEN2(FLOAT _x, FLOAT _y);
+ explicit _XMUBYTEN2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMUBYTEN2& operator= (CONST _XMUBYTEN2& UByteN2);
+
+#endif // __cplusplus
+
+} XMUBYTEN2;
+
+
+// 2D Vector; 8 bit unsigned integer components
+typedef struct _XMUBYTE2
+{
+ BYTE x;
+ BYTE y;
+
+#ifdef __cplusplus
+
+ _XMUBYTE2() {};
+ _XMUBYTE2(BYTE _x, BYTE _y) : x(_x), y(_y) {};
+ explicit _XMUBYTE2(_In_count_c_(2) CONST BYTE *pArray);
+ _XMUBYTE2(FLOAT _x, FLOAT _y);
+ explicit _XMUBYTE2(_In_count_c_(2) CONST FLOAT *pArray);
+
+ _XMUBYTE2& operator= (CONST _XMUBYTE2& UByte2);
+
+#endif // __cplusplus
+
+} XMUBYTE2;
+
+//------------------------------------------------------------------------------
+// 3D Vector; 32 bit floating point components
+typedef struct _XMFLOAT3
+{
+ FLOAT x;
+ FLOAT y;
+ FLOAT z;
+
+#ifdef __cplusplus
+
+ _XMFLOAT3() {};
+ _XMFLOAT3(FLOAT _x, FLOAT _y, FLOAT _z) : x(_x), y(_y), z(_z) {};
+ _XMFLOAT3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ _XMFLOAT3& operator= (CONST _XMFLOAT3& Float3);
+
+#endif // __cplusplus
+
+} XMFLOAT3;
+
+// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary
+#ifdef __cplusplus
+__declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3
+{
+ XMFLOAT3A() : XMFLOAT3() {};
+ XMFLOAT3A(FLOAT _x, FLOAT _y, FLOAT _z) : XMFLOAT3(_x, _y, _z) {};
+ XMFLOAT3A(_In_count_c_(3) CONST FLOAT *pArray) : XMFLOAT3(pArray) {};
+
+ XMFLOAT3A& operator= (CONST XMFLOAT3A& Float3);
+};
+#else
+typedef __declspec(align(16)) XMFLOAT3 XMFLOAT3A;
+#endif // __cplusplus
+
+//------------------------------------------------------------------------------
+// 3D Vector; 32 bit signed integer components
+typedef struct _XMINT3
+{
+ INT x;
+ INT y;
+ INT z;
+
+#ifdef __cplusplus
+
+ _XMINT3() {};
+ _XMINT3(INT _x, INT _y, INT _z) : x(_x), y(_y), z(_z) {};
+ explicit _XMINT3(_In_count_c_(3) CONST INT *pArray);
+
+ _XMINT3& operator= (CONST _XMINT3& Int3);
+
+#endif // __cplusplus
+
+} XMINT3;
+
+// 3D Vector; 32 bit unsigned integer components
+typedef struct _XMUINT3
+{
+ UINT x;
+ UINT y;
+ UINT z;
+
+#ifdef __cplusplus
+
+ _XMUINT3() {};
+ _XMUINT3(UINT _x, UINT _y, UINT _z) : x(_x), y(_y), z(_z) {};
+ explicit _XMUINT3(_In_count_c_(3) CONST UINT *pArray);
+
+ _XMUINT3& operator= (CONST _XMUINT3& UInt3);
+
+#endif // __cplusplus
+
+} XMUINT3;
+
+//------------------------------------------------------------------------------
+// 3D Vector; 11-11-10 bit normalized components packed into a 32 bit integer
+// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit signed,
+// normalized integer for the z component and 11 bit signed, normalized
+// integers for the x and y components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0]
+typedef struct _XMHENDN3
+{
+ union
+ {
+ struct
+ {
+ INT x : 11; // -1023/1023 to 1023/1023
+ INT y : 11; // -1023/1023 to 1023/1023
+ INT z : 10; // -511/511 to 511/511
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMHENDN3() {};
+ explicit _XMHENDN3(UINT Packed) : v(Packed) {};
+ _XMHENDN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMHENDN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMHENDN3& operator= (CONST _XMHENDN3& HenDN3);
+ _XMHENDN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMHENDN3;
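As a rough illustration of the layout described above (the supported unpacking path is XMLoadHenDN3, declared later in this header; this sketch ignores the edge encodings):

    UINT packed = 0x000003FF;             // x field = 1023, y and z fields = 0
    XMHENDN3 p(packed);
    float x = (float)p.x / 1023.0f;       // 11-bit signed field -> 1.0f
    float y = (float)p.y / 1023.0f;       // 0.0f
    float z = (float)p.z /  511.0f;       // 10-bit signed field -> 0.0f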
+
+// 3D Vector; 11-11-10 bit components packed into a 32 bit integer
+// The 3D Vector is packed into 32 bits as follows: a 10 bit signed,
+// integer for the z component and 11 bit signed integers for the
+// x and y components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0]
+typedef struct _XMHEND3
+{
+ union
+ {
+ struct
+ {
+ INT x : 11; // -1023 to 1023
+ INT y : 11; // -1023 to 1023
+ INT z : 10; // -511 to 511
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMHEND3() {};
+ explicit _XMHEND3(UINT Packed) : v(Packed) {};
+ _XMHEND3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMHEND3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMHEND3& operator= (CONST _XMHEND3& HenD3);
+ _XMHEND3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMHEND3;
+
+// 3D Vector; 11-11-10 bit normalized components packed into a 32 bit integer
+// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit unsigned,
+// normalized integer for the z component and 11 bit unsigned, normalized
+// integers for the x and y components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0]
+typedef struct _XMUHENDN3
+{
+ union
+ {
+ struct
+ {
+ UINT x : 11; // 0/2047 to 2047/2047
+ UINT y : 11; // 0/2047 to 2047/2047
+ UINT z : 10; // 0/1023 to 1023/1023
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUHENDN3() {};
+ explicit _XMUHENDN3(UINT Packed) : v(Packed) {};
+ _XMUHENDN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMUHENDN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUHENDN3& operator= (CONST _XMUHENDN3& UHenDN3);
+ _XMUHENDN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUHENDN3;
+
+// 3D Vector; 11-11-10 bit components packed into a 32 bit integer
+// The 3D Vector is packed into 32 bits as follows: a 10 bit unsigned
+// integer for the z component and 11 bit unsigned integers
+// for the x and y components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z10Y11X11): [32] zzzzzzzz zzyyyyyy yyyyyxxx xxxxxxxx [0]
+typedef struct _XMUHEND3
+{
+ union
+ {
+ struct
+ {
+ UINT x : 11; // 0 to 2047
+ UINT y : 11; // 0 to 2047
+ UINT z : 10; // 0 to 1023
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUHEND3() {};
+ explicit _XMUHEND3(UINT Packed) : v(Packed) {};
+ _XMUHEND3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMUHEND3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUHEND3& operator= (CONST _XMUHEND3& UHenD3);
+ _XMUHEND3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUHEND3;
+
+// 3D Vector; 10-11-11 bit normalized components packed into a 32 bit integer
+// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit signed,
+// normalized integer for the x component and 11 bit signed, normalized
+// integers for the y and z components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMDHENN3
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511/511 to 511/511
+ INT y : 11; // -1023/1023 to 1023/1023
+ INT z : 11; // -1023/1023 to 1023/1023
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMDHENN3() {};
+ explicit _XMDHENN3(UINT Packed) : v(Packed) {};
+ _XMDHENN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMDHENN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMDHENN3& operator= (CONST _XMDHENN3& DHenN3);
+ _XMDHENN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMDHENN3;
+
+// 3D Vector; 10-11-11 bit components packed into a 32 bit integer
+// The 3D Vector is packed into 32 bits as follows: a 10 bit signed,
+// integer for the x component and 11 bit signed integers for the
+// y and z components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMDHEN3
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511 to 511
+ INT y : 11; // -1023 to 1023
+ INT z : 11; // -1023 to 1023
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMDHEN3() {};
+ explicit _XMDHEN3(UINT Packed) : v(Packed) {};
+ _XMDHEN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMDHEN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMDHEN3& operator= (CONST _XMDHEN3& DHen3);
+ _XMDHEN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMDHEN3;
+
+// 3D Vector; 10-11-11 bit normalized components packed into a 32 bit integer
+// The normalized 3D Vector is packed into 32 bits as follows: a 10 bit unsigned,
+// normalized integer for the x component and 11 bit unsigned, normalized
+// integers for the y and z components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMUDHENN3
+{
+ union
+ {
+ struct
+ {
+ UINT x : 10; // 0/1023 to 1023/1023
+ UINT y : 11; // 0/2047 to 2047/2047
+ UINT z : 11; // 0/2047 to 2047/2047
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUDHENN3() {};
+ explicit _XMUDHENN3(UINT Packed) : v(Packed) {};
+ _XMUDHENN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMUDHENN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUDHENN3& operator= (CONST _XMUDHENN3& UDHenN3);
+ _XMUDHENN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUDHENN3;
+
+// 3D Vector; 10-11-11 bit components packed into a 32 bit integer
+// The 3D Vector is packed into 32 bits as follows: a 10 bit unsigned,
+// integer for the x component and 11 bit unsigned integers
+// for the y and z components. The z component is stored in the
+// most significant bits and the x component in the least significant bits
+// (Z11Y11X10): [32] zzzzzzzz zzzyyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMUDHEN3
+{
+ union
+ {
+ struct
+ {
+ UINT x : 10; // 0 to 1023
+ UINT y : 11; // 0 to 2047
+ UINT z : 11; // 0 to 2047
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUDHEN3() {};
+ explicit _XMUDHEN3(UINT Packed) : v(Packed) {};
+ _XMUDHEN3(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMUDHEN3(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUDHEN3& operator= (CONST _XMUDHEN3& UDHen3);
+ _XMUDHEN3& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUDHEN3;
+
+//------------------------------------------------------------------------------
+// 3D vector: 5/6/5 unsigned integer components
+typedef struct _XMU565
+{
+ union
+ {
+ struct
+ {
+ USHORT x : 5;
+ USHORT y : 6;
+ USHORT z : 5;
+ };
+ USHORT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMU565() {};
+ explicit _XMU565(USHORT Packed) : v(Packed) {};
+ _XMU565(CHAR _x, CHAR _y, CHAR _z) : x(_x), y(_y), z(_z) {};
+ explicit _XMU565(_In_count_c_(3) CONST CHAR *pArray);
+ _XMU565(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMU565(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator USHORT () const { return v; }
+
+ _XMU565& operator= (CONST _XMU565& U565);
+ _XMU565& operator= (CONST USHORT Packed);
+
+#endif // __cplusplus
+
+} XMU565;
+
+//------------------------------------------------------------------------------
+// 3D vector: 11/11/10 floating-point components
+// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent
+// and 6-bit mantissa for x component, a 5-bit biased exponent and
+// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit
+// mantissa for z. The z component is stored in the most significant bits
+// and the x component in the least significant bits. No sign bits so
+// all partial-precision numbers are positive.
+// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0]
+typedef struct _XMFLOAT3PK
+{
+ union
+ {
+ struct
+ {
+ UINT xm : 6;
+ UINT xe : 5;
+ UINT ym : 6;
+ UINT ye : 5;
+ UINT zm : 5;
+ UINT ze : 5;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMFLOAT3PK() {};
+ explicit _XMFLOAT3PK(UINT Packed) : v(Packed) {};
+ _XMFLOAT3PK(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMFLOAT3PK(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMFLOAT3PK& operator= (CONST _XMFLOAT3PK& float3pk);
+ _XMFLOAT3PK& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMFLOAT3PK;
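A sketch of how one component of this layout decodes in the normalized case only (denormals, infinities and NaNs are handled by XMLoadFloat3PK; ldexpf requires <math.h>):

    UINT packed = 0x000003C0;             // xe = 15, xm = 0
    XMFLOAT3PK pk(packed);
    float x = ldexpf(1.0f + (float)pk.xm / 64.0f, (int)pk.xe - 15);   // 1.0f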
+
+//------------------------------------------------------------------------------
+// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent
+// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent
+// with 9-bit mantissa for the x, y, and z component. The shared exponent
+// is stored in the most significant bits and the x component mantissa is in
+// the least significant bits. No sign bits so all partial-precision numbers
+// are positive.
+// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0]
+typedef struct _XMFLOAT3SE
+{
+ union
+ {
+ struct
+ {
+ UINT xm : 9;
+ UINT ym : 9;
+ UINT zm : 9;
+ UINT e : 5;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMFLOAT3SE() {};
+ explicit _XMFLOAT3SE(UINT Packed) : v(Packed) {};
+ _XMFLOAT3SE(FLOAT _x, FLOAT _y, FLOAT _z);
+ explicit _XMFLOAT3SE(_In_count_c_(3) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMFLOAT3SE& operator= (CONST _XMFLOAT3SE& float3se);
+ _XMFLOAT3SE& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMFLOAT3SE;
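A sketch of the shared-exponent decode (XMLoadFloat3SE is the supported path; ldexpf requires <math.h>; the 24 below is the exponent bias of 15 plus the 9 mantissa bits):

    UINT packed = 0x80000100;             // e = 16, xm = 256, ym = zm = 0
    XMFLOAT3SE se(packed);
    float scale = ldexpf(1.0f, (int)se.e - 24);
    float x = (float)se.xm * scale;       // 1.0f
    float y = (float)se.ym * scale;       // 0.0f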
+
+//------------------------------------------------------------------------------
+// 4D Vector; 32 bit floating point components
+typedef struct _XMFLOAT4
+{
+ FLOAT x;
+ FLOAT y;
+ FLOAT z;
+ FLOAT w;
+
+#ifdef __cplusplus
+
+ _XMFLOAT4() {};
+ _XMFLOAT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ _XMFLOAT4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMFLOAT4& operator= (CONST _XMFLOAT4& Float4);
+
+#endif // __cplusplus
+
+} XMFLOAT4;
+
+// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary
+#ifdef __cplusplus
+__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4
+{
+ XMFLOAT4A() : XMFLOAT4() {};
+ XMFLOAT4A(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w) : XMFLOAT4(_x, _y, _z, _w) {};
+ XMFLOAT4A(_In_count_c_(4) CONST FLOAT *pArray) : XMFLOAT4(pArray) {};
+
+ XMFLOAT4A& operator= (CONST XMFLOAT4A& Float4);
+};
+#else
+typedef __declspec(align(16)) XMFLOAT4 XMFLOAT4A;
+#endif // __cplusplus
+
+//------------------------------------------------------------------------------
+// 4D Vector; 32 bit signed integer components
+typedef struct _XMINT4
+{
+ INT x;
+ INT y;
+ INT z;
+ INT w;
+
+#ifdef __cplusplus
+
+ _XMINT4() {};
+ _XMINT4(INT _x, INT _y, INT _z, INT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMINT4(_In_count_c_(4) CONST INT *pArray);
+
+ _XMINT4& operator= (CONST _XMINT4& Int4);
+
+#endif // __cplusplus
+
+} XMINT4;
+
+// 4D Vector; 32 bit unsigned integer components
+typedef struct _XMUINT4
+{
+ UINT x;
+ UINT y;
+ UINT z;
+ UINT w;
+
+#ifdef __cplusplus
+
+ _XMUINT4() {};
+ _XMUINT4(UINT _x, UINT _y, UINT _z, UINT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUINT4(_In_count_c_(4) CONST UINT *pArray);
+
+ _XMUINT4& operator= (CONST _XMUINT4& UInt4);
+
+#endif // __cplusplus
+
+} XMUINT4;
+
+//------------------------------------------------------------------------------
+// 4D Vector; 16 bit floating point components
+typedef struct _XMHALF4
+{
+ HALF x;
+ HALF y;
+ HALF z;
+ HALF w;
+
+#ifdef __cplusplus
+
+ _XMHALF4() {};
+ _XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMHALF4(_In_count_c_(4) CONST HALF *pArray);
+ _XMHALF4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMHALF4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMHALF4& operator= (CONST _XMHALF4& Half4);
+
+#endif // __cplusplus
+
+} XMHALF4;
+
+//------------------------------------------------------------------------------
+// 4D Vector; 16 bit signed normalized integer components
+typedef struct _XMSHORTN4
+{
+ SHORT x;
+ SHORT y;
+ SHORT z;
+ SHORT w;
+
+#ifdef __cplusplus
+
+ _XMSHORTN4() {};
+ _XMSHORTN4(SHORT _x, SHORT _y, SHORT _z, SHORT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMSHORTN4(_In_count_c_(4) CONST SHORT *pArray);
+ _XMSHORTN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMSHORTN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMSHORTN4& operator= (CONST _XMSHORTN4& ShortN4);
+
+#endif // __cplusplus
+
+} XMSHORTN4;
+
+// 4D Vector; 16 bit signed integer components
+typedef struct _XMSHORT4
+{
+ SHORT x;
+ SHORT y;
+ SHORT z;
+ SHORT w;
+
+#ifdef __cplusplus
+
+ _XMSHORT4() {};
+ _XMSHORT4(SHORT _x, SHORT _y, SHORT _z, SHORT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMSHORT4(_In_count_c_(4) CONST SHORT *pArray);
+ _XMSHORT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMSHORT4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMSHORT4& operator= (CONST _XMSHORT4& Short4);
+
+#endif // __cplusplus
+
+} XMSHORT4;
+
+// 4D Vector; 16 bit unsigned normalized integer components
+typedef struct _XMUSHORTN4
+{
+ USHORT x;
+ USHORT y;
+ USHORT z;
+ USHORT w;
+
+#ifdef __cplusplus
+
+ _XMUSHORTN4() {};
+ _XMUSHORTN4(USHORT _x, USHORT _y, USHORT _z, USHORT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUSHORTN4(_In_count_c_(4) CONST USHORT *pArray);
+ _XMUSHORTN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUSHORTN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMUSHORTN4& operator= (CONST _XMUSHORTN4& UShortN4);
+
+#endif // __cplusplus
+
+} XMUSHORTN4;
+
+// 4D Vector; 16 bit unsigned integer components
+typedef struct _XMUSHORT4
+{
+ USHORT x;
+ USHORT y;
+ USHORT z;
+ USHORT w;
+
+#ifdef __cplusplus
+
+ _XMUSHORT4() {};
+ _XMUSHORT4(USHORT _x, USHORT _y, USHORT _z, USHORT _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUSHORT4(_In_count_c_(4) CONST USHORT *pArray);
+ _XMUSHORT4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUSHORT4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMUSHORT4& operator= (CONST _XMUSHORT4& UShort4);
+
+#endif // __cplusplus
+
+} XMUSHORT4;
+
+//------------------------------------------------------------------------------
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// normalized integer for the w component and 10 bit signed, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMXDECN4
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511/511 to 511/511
+ INT y : 10; // -511/511 to 511/511
+ INT z : 10; // -511/511 to 511/511
+ UINT w : 2; // 0/3 to 3/3
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMXDECN4() {};
+ explicit _XMXDECN4(UINT Packed) : v(Packed) {};
+ _XMXDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMXDECN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMXDECN4& operator= (CONST _XMXDECN4& XDecN4);
+ _XMXDECN4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMXDECN4;
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned
+// integer for the w component and 10 bit signed integers for the
+// z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMXDEC4
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511 to 511
+ INT y : 10; // -511 to 511
+ INT z : 10; // -511 to 511
+ UINT w : 2; // 0 to 3
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMXDEC4() {};
+ explicit _XMXDEC4(UINT Packed) : v(Packed) {};
+ _XMXDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMXDEC4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMXDEC4& operator= (CONST _XMXDEC4& XDec4);
+ _XMXDEC4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMXDEC4;
+
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed,
+// normalized integer for the w component and 10 bit signed, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMDECN4
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511/511 to 511/511
+ INT y : 10; // -511/511 to 511/511
+ INT z : 10; // -511/511 to 511/511
+ INT w : 2; // -1/1 to 1/1
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMDECN4() {};
+ explicit _XMDECN4(UINT Packed) : v(Packed) {};
+ _XMDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMDECN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMDECN4& operator= (CONST _XMDECN4& DecN4);
+ _XMDECN4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMDECN4;
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit signed,
+// integer for the w component and 10 bit signed integers for the
+// z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMDEC4
+{
+ union
+ {
+ struct
+ {
+ INT x : 10; // -511 to 511
+ INT y : 10; // -511 to 511
+ INT z : 10; // -511 to 511
+ INT w : 2; // -1 to 1
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMDEC4() {};
+ explicit _XMDEC4(UINT Packed) : v(Packed) {};
+ _XMDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMDEC4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMDEC4& operator= (CONST _XMDEC4& Dec4);
+ _XMDEC4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMDEC4;
+
+// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// normalized integer for the w component and 10 bit unsigned, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMUDECN4
+{
+ union
+ {
+ struct
+ {
+ UINT x : 10; // 0/1023 to 1023/1023
+ UINT y : 10; // 0/1023 to 1023/1023
+ UINT z : 10; // 0/1023 to 1023/1023
+ UINT w : 2; // 0/3 to 3/3
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUDECN4() {};
+ explicit _XMUDECN4(UINT Packed) : v(Packed) {};
+ _XMUDECN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUDECN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUDECN4& operator= (CONST _XMUDECN4& UDecN4);
+ _XMUDECN4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUDECN4;
+
+// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+// integer for the w component and 10 bit unsigned integers
+// for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+typedef struct _XMUDEC4
+{
+ union
+ {
+ struct
+ {
+ UINT x : 10; // 0 to 1023
+ UINT y : 10; // 0 to 1023
+ UINT z : 10; // 0 to 1023
+ UINT w : 2; // 0 to 3
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUDEC4() {};
+ explicit _XMUDEC4(UINT Packed) : v(Packed) {};
+ _XMUDEC4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUDEC4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return v; }
+
+ _XMUDEC4& operator= (CONST _XMUDEC4& UDec4);
+ _XMUDEC4& operator= (CONST UINT Packed);
+
+#endif // __cplusplus
+
+} XMUDEC4;
+
+//------------------------------------------------------------------------------
+// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer
+// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit unsigned,
+// normalized integer for the w component and 20 bit signed, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMXICON4
+{
+ union
+ {
+ struct
+ {
+ INT64 x : 20; // -524287/524287 to 524287/524287
+ INT64 y : 20; // -524287/524287 to 524287/524287
+ INT64 z : 20; // -524287/524287 to 524287/524287
+ UINT64 w : 4; // 0/15 to 15/15
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMXICON4() {};
+ explicit _XMXICON4(UINT64 Packed) : v(Packed) {};
+ _XMXICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMXICON4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMXICON4& operator= (CONST _XMXICON4& XIcoN4);
+ _XMXICON4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMXICON4;
+
+// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer
+// The 4D Vector is packed into 64 bits as follows: a 4 bit unsigned
+// integer for the w component and 20 bit signed integers for the
+// z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMXICO4
+{
+ union
+ {
+ struct
+ {
+ INT64 x : 20; // -524287 to 524287
+ INT64 y : 20; // -524287 to 524287
+ INT64 z : 20; // -524287 to 524287
+ UINT64 w : 4; // 0 to 15
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMXICO4() {};
+ explicit _XMXICO4(UINT64 Packed) : v(Packed) {};
+ _XMXICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMXICO4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMXICO4& operator= (CONST _XMXICO4& XIco4);
+ _XMXICO4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMXICO4;
+
+// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer
+// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit signed,
+// normalized integer for the w component and 20 bit signed, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMICON4
+{
+ union
+ {
+ struct
+ {
+ INT64 x : 20; // -524287/524287 to 524287/524287
+ INT64 y : 20; // -524287/524287 to 524287/524287
+ INT64 z : 20; // -524287/524287 to 524287/524287
+ INT64 w : 4; // -7/7 to 7/7
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMICON4() {};
+ explicit _XMICON4(UINT64 Packed) : v(Packed) {};
+ _XMICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMICON4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMICON4& operator= (CONST _XMICON4& IcoN4);
+ _XMICON4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMICON4;
+
+// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer
+// The 4D Vector is packed into 64 bits as follows: a 4 bit signed,
+// integer for the w component and 20 bit signed integers for the
+// z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMICO4
+{
+ union
+ {
+ struct
+ {
+ INT64 x : 20; // -524287 to 524287
+ INT64 y : 20; // -524287 to 524287
+ INT64 z : 20; // -524287 to 524287
+ INT64 w : 4; // -7 to 7
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMICO4() {};
+ explicit _XMICO4(UINT64 Packed) : v(Packed) {};
+ _XMICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMICO4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMICO4& operator= (CONST _XMICO4& Ico4);
+ _XMICO4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMICO4;
+
+// 4D Vector; 20-20-20-4 bit normalized components packed into a 64 bit integer
+// The normalized 4D Vector is packed into 64 bits as follows: a 4 bit unsigned,
+// normalized integer for the w component and 20 bit unsigned, normalized
+// integers for the z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMUICON4
+{
+ union
+ {
+ struct
+ {
+ UINT64 x : 20; // 0/1048575 to 1048575/1048575
+ UINT64 y : 20; // 0/1048575 to 1048575/1048575
+ UINT64 z : 20; // 0/1048575 to 1048575/1048575
+ UINT64 w : 4; // 0/15 to 15/15
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUICON4() {};
+ explicit _XMUICON4(UINT64 Packed) : v(Packed) {};
+ _XMUICON4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUICON4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMUICON4& operator= (CONST _XMUICON4& UIcoN4);
+ _XMUICON4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMUICON4;
+
+// 4D Vector; 20-20-20-4 bit components packed into a 64 bit integer
+// The 4D Vector is packed into 64 bits as follows: a 4 bit unsigned
+// integer for the w component and 20 bit unsigned integers for the
+// z, y, and x components. The w component is stored in the
+// most significant bits and the x component in the least significant bits
+// (W4Z20Y20X20): [64] wwwwzzzz zzzzzzzz zzzzzzzz yyyyyyyy yyyyyyyy yyyyxxxx xxxxxxxx xxxxxxxx [0]
+typedef struct _XMUICO4
+{
+ union
+ {
+ struct
+ {
+ UINT64 x : 20; // 0 to 1048575
+ UINT64 y : 20; // 0 to 1048575
+ UINT64 z : 20; // 0 to 1048575
+ UINT64 w : 4; // 0 to 15
+ };
+ UINT64 v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUICO4() {};
+ explicit _XMUICO4(UINT64 Packed) : v(Packed) {};
+ _XMUICO4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUICO4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT64 () const { return v; }
+
+ _XMUICO4& operator= (CONST _XMUICO4& UIco4);
+ _XMUICO4& operator= (CONST UINT64 Packed);
+
+#endif // __cplusplus
+
+} XMUICO4;
+
+//------------------------------------------------------------------------------
+// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into
+// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit
+// unsigned, normalized integers for the alpha, red, green, and blue components.
+// The alpha component is stored in the most significant bits and the blue
+// component in the least significant bits (A8R8G8B8):
+// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0]
+typedef struct _XMCOLOR
+{
+ union
+ {
+ struct
+ {
+ UINT b : 8; // Blue: 0/255 to 255/255
+ UINT g : 8; // Green: 0/255 to 255/255
+ UINT r : 8; // Red: 0/255 to 255/255
+ UINT a : 8; // Alpha: 0/255 to 255/255
+ };
+ UINT c;
+ };
+
+#ifdef __cplusplus
+
+ _XMCOLOR() {};
+ _XMCOLOR(UINT Color) : c(Color) {};
+ _XMCOLOR(FLOAT _r, FLOAT _g, FLOAT _b, FLOAT _a);
+ explicit _XMCOLOR(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator UINT () const { return c; }
+
+ _XMCOLOR& operator= (CONST _XMCOLOR& Color);
+ _XMCOLOR& operator= (CONST UINT Color);
+
+#endif // __cplusplus
+
+} XMCOLOR;
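Note that while the packed UINT reads A8R8G8B8, the byte fields are laid out blue-first in memory. A usage sketch:

    XMCOLOR grey(0.5f, 0.5f, 0.5f, 1.0f);   // FLOAT constructor takes (r, g, b, a)
    UINT packed = grey;                      // alpha lands in the most significant byte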
+
+//------------------------------------------------------------------------------
+// 4D Vector; 8 bit signed normalized integer components
+typedef struct _XMBYTEN4
+{
+ union
+ {
+ struct
+ {
+ CHAR x;
+ CHAR y;
+ CHAR z;
+ CHAR w;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMBYTEN4() {};
+ _XMBYTEN4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMBYTEN4(UINT Packed) : v(Packed) {};
+ explicit _XMBYTEN4(_In_count_c_(4) CONST CHAR *pArray);
+ _XMBYTEN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMBYTEN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMBYTEN4& operator= (CONST _XMBYTEN4& ByteN4);
+ _XMBYTEN4& operator= (UINT Packed) { v = Packed; return *this; }
+
+#endif // __cplusplus
+
+} XMBYTEN4;
+
+// 4D Vector; 8 bit signed integer components
+typedef struct _XMBYTE4
+{
+ union
+ {
+ struct
+ {
+ CHAR x;
+ CHAR y;
+ CHAR z;
+ CHAR w;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMBYTE4() {};
+ _XMBYTE4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMBYTE4(UINT Packed) : v(Packed) {};
+ explicit _XMBYTE4(_In_count_c_(4) CONST CHAR *pArray);
+ _XMBYTE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMBYTE4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMBYTE4& operator= (CONST _XMBYTE4& Byte4);
+ _XMBYTE4& operator= (UINT Packed) { v = Packed; return *this; }
+
+#endif // __cplusplus
+
+} XMBYTE4;
+
+// 4D Vector; 8 bit unsigned normalized integer components
+typedef struct _XMUBYTEN4
+{
+ union
+ {
+ struct
+ {
+ BYTE x;
+ BYTE y;
+ BYTE z;
+ BYTE w;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUBYTEN4() {};
+ _XMUBYTEN4(BYTE _x, BYTE _y, BYTE _z, BYTE _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUBYTEN4(UINT Packed) : v(Packed) {};
+ explicit _XMUBYTEN4(_In_count_c_(4) CONST BYTE *pArray);
+ _XMUBYTEN4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUBYTEN4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMUBYTEN4& operator= (CONST _XMUBYTEN4& UByteN4);
+ _XMUBYTEN4& operator= (UINT Packed) { v = Packed; return *this; }
+
+#endif // __cplusplus
+
+} XMUBYTEN4;
+
+// 4D Vector; 8 bit unsigned integer components
+typedef struct _XMUBYTE4
+{
+ union
+ {
+ struct
+ {
+ BYTE x;
+ BYTE y;
+ BYTE z;
+ BYTE w;
+ };
+ UINT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUBYTE4() {};
+ _XMUBYTE4(BYTE _x, BYTE _y, BYTE _z, BYTE _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUBYTE4(UINT Packed) : v(Packed) {};
+ explicit _XMUBYTE4(_In_count_c_(4) CONST BYTE *pArray);
+ _XMUBYTE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUBYTE4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ _XMUBYTE4& operator= (CONST _XMUBYTE4& UByte4);
+ _XMUBYTE4& operator= (UINT Packed) { v = Packed; return *this; }
+
+#endif // __cplusplus
+
+} XMUBYTE4;
+
+//------------------------------------------------------------------------------
+// 4D vector; 4 bit unsigned integer components
+typedef struct _XMUNIBBLE4
+{
+ union
+ {
+ struct
+ {
+ USHORT x : 4;
+ USHORT y : 4;
+ USHORT z : 4;
+ USHORT w : 4;
+ };
+ USHORT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMUNIBBLE4() {};
+ explicit _XMUNIBBLE4(USHORT Packed) : v(Packed) {};
+ _XMUNIBBLE4(CHAR _x, CHAR _y, CHAR _z, CHAR _w) : x(_x), y(_y), z(_z), w(_w) {};
+ explicit _XMUNIBBLE4(_In_count_c_(4) CONST CHAR *pArray);
+ _XMUNIBBLE4(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w);
+ explicit _XMUNIBBLE4(_In_count_c_(4) CONST FLOAT *pArray);
+
+ operator USHORT () const { return v; }
+
+ _XMUNIBBLE4& operator= (CONST _XMUNIBBLE4& UNibble4);
+ _XMUNIBBLE4& operator= (CONST USHORT Packed);
+
+#endif // __cplusplus
+
+} XMUNIBBLE4;
+
+//------------------------------------------------------------------------------
+// 4D vector: 5/5/5/1 unsigned integer components
+typedef struct _XMU555
+{
+ union
+ {
+ struct
+ {
+ USHORT x : 5;
+ USHORT y : 5;
+ USHORT z : 5;
+ USHORT w : 1;
+ };
+ USHORT v;
+ };
+
+#ifdef __cplusplus
+
+ _XMU555() {};
+ explicit _XMU555(USHORT Packed) : v(Packed) {};
+ _XMU555(CHAR _x, CHAR _y, CHAR _z, BOOL _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {};
+ _XMU555(_In_count_c_(3) CONST CHAR *pArray, BOOL _w);
+ _XMU555(FLOAT _x, FLOAT _y, FLOAT _z, BOOL _w);
+ _XMU555(_In_count_c_(3) CONST FLOAT *pArray, BOOL _w);
+
+ operator USHORT () const { return v; }
+
+ _XMU555& operator= (CONST _XMU555& U555);
+ _XMU555& operator= (CONST USHORT Packed);
+
+#endif // __cplusplus
+
+} XMU555;
+
+//------------------------------------------------------------------------------
+// 3x3 Matrix: 32 bit floating point components
+typedef struct _XMFLOAT3X3
+{
+ union
+ {
+ struct
+ {
+ FLOAT _11, _12, _13;
+ FLOAT _21, _22, _23;
+ FLOAT _31, _32, _33;
+ };
+ FLOAT m[3][3];
+ };
+
+#ifdef __cplusplus
+
+ _XMFLOAT3X3() {};
+ _XMFLOAT3X3(FLOAT m00, FLOAT m01, FLOAT m02,
+ FLOAT m10, FLOAT m11, FLOAT m12,
+ FLOAT m20, FLOAT m21, FLOAT m22);
+ explicit _XMFLOAT3X3(_In_count_c_(9) CONST FLOAT *pArray);
+
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+
+ _XMFLOAT3X3& operator= (CONST _XMFLOAT3X3& Float3x3);
+
+#endif // __cplusplus
+
+} XMFLOAT3X3;
+
+//------------------------------------------------------------------------------
+// 4x3 Matrix: 32 bit floating point components
+typedef struct _XMFLOAT4X3
+{
+ union
+ {
+ struct
+ {
+ FLOAT _11, _12, _13;
+ FLOAT _21, _22, _23;
+ FLOAT _31, _32, _33;
+ FLOAT _41, _42, _43;
+ };
+ FLOAT m[4][3];
+ };
+
+#ifdef __cplusplus
+
+ _XMFLOAT4X3() {};
+ _XMFLOAT4X3(FLOAT m00, FLOAT m01, FLOAT m02,
+ FLOAT m10, FLOAT m11, FLOAT m12,
+ FLOAT m20, FLOAT m21, FLOAT m22,
+ FLOAT m30, FLOAT m31, FLOAT m32);
+ explicit _XMFLOAT4X3(_In_count_c_(12) CONST FLOAT *pArray);
+
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+
+ _XMFLOAT4X3& operator= (CONST _XMFLOAT4X3& Float4x3);
+
+#endif // __cplusplus
+
+} XMFLOAT4X3;
+
+// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary
+#ifdef __cplusplus
+__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3
+{
+ XMFLOAT4X3A() : XMFLOAT4X3() {};
+ XMFLOAT4X3A(FLOAT m00, FLOAT m01, FLOAT m02,
+ FLOAT m10, FLOAT m11, FLOAT m12,
+ FLOAT m20, FLOAT m21, FLOAT m22,
+ FLOAT m30, FLOAT m31, FLOAT m32) :
+ XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {};
+ explicit XMFLOAT4X3A(_In_count_c_(12) CONST FLOAT *pArray) : XMFLOAT4X3(pArray) {}
+
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+
+ XMFLOAT4X3A& operator= (CONST XMFLOAT4X3A& Float4x3);
+};
+#else
+typedef __declspec(align(16)) XMFLOAT4X3 XMFLOAT4X3A;
+#endif // __cplusplus
+
+//------------------------------------------------------------------------------
+// 4x4 Matrix: 32 bit floating point components
+typedef struct _XMFLOAT4X4
+{
+ union
+ {
+ struct
+ {
+ FLOAT _11, _12, _13, _14;
+ FLOAT _21, _22, _23, _24;
+ FLOAT _31, _32, _33, _34;
+ FLOAT _41, _42, _43, _44;
+ };
+ FLOAT m[4][4];
+ };
+
+#ifdef __cplusplus
+
+ _XMFLOAT4X4() {};
+ _XMFLOAT4X4(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33);
+ explicit _XMFLOAT4X4(_In_count_c_(16) CONST FLOAT *pArray);
+
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+
+ _XMFLOAT4X4& operator= (CONST _XMFLOAT4X4& Float4x4);
+
+#endif // __cplusplus
+
+} XMFLOAT4X4;
+
+// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary
+#ifdef __cplusplus
+__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4
+{
+ XMFLOAT4X4A() : XMFLOAT4X4() {};
+ XMFLOAT4X4A(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33)
+ : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {};
+ explicit XMFLOAT4X4A(_In_count_c_(16) CONST FLOAT *pArray) : XMFLOAT4X4(pArray) {}
+
+ FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; }
+ FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; }
+
+ XMFLOAT4X4A& operator= (CONST XMFLOAT4X4A& Float4x4);
+};
+#else
+typedef __declspec(align(16)) XMFLOAT4X4 XMFLOAT4X4A;
+#endif // __cplusplus
+
+
+#ifdef _XM_BIGENDIAN_
+#pragma bitfield_order(pop)
+#endif
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * Data conversion operations
+ *
+ ****************************************************************************/
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_)
+#else
+XMVECTOR XMConvertVectorIntToFloat(FXMVECTOR VInt, UINT DivExponent);
+XMVECTOR XMConvertVectorFloatToInt(FXMVECTOR VFloat, UINT MulExponent);
+XMVECTOR XMConvertVectorUIntToFloat(FXMVECTOR VUInt, UINT DivExponent);
+XMVECTOR XMConvertVectorFloatToUInt(FXMVECTOR VFloat, UINT MulExponent);
+#endif
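The Div/MulExponent parameters make these fixed-point conversions: each lane is scaled by two raised to the given exponent. A sketch (XMVectorSetInt is declared further below):

    // Interpret lane 0 as 16.16 fixed point.
    XMVECTOR vi = XMVectorSetInt(0x00018000, 0, 0, 0);   // 1.5 in 16.16 fixed point
    XMVECTOR vf = XMConvertVectorIntToFloat(vi, 16);     // lane 0 becomes 1.5f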
+
+FLOAT XMConvertHalfToFloat(HALF Value);
+FLOAT* XMConvertHalfToFloatStream(_Out_bytecap_x_(sizeof(FLOAT)+OutputStride*(HalfCount-1)) FLOAT* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(HALF)+InputStride*(HalfCount-1)) CONST HALF* pInputStream,
+ _In_ size_t InputStride, _In_ size_t HalfCount);
+HALF XMConvertFloatToHalf(FLOAT Value);
+HALF* XMConvertFloatToHalfStream(_Out_bytecap_x_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(FLOAT)+InputStride*(FloatCount-1)) CONST FLOAT* pInputStream,
+ _In_ size_t InputStride, _In_ size_t FloatCount);
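The stream variants walk interleaved data by byte stride, so a single component can be converted inside a larger structure. A sketch (the Vertex struct and element count are assumptions for the example, not part of the header):

    struct Vertex { FLOAT px, py, pz; };
    Vertex verts[64] = {};
    HALF   halves[64];
    XMConvertFloatToHalfStream(halves, sizeof(HALF),
                               &verts[0].px, sizeof(Vertex), 64);   // convert every px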
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+XMVECTOR XMVectorSetBinaryConstant(UINT C0, UINT C1, UINT C2, UINT C3);
+XMVECTOR XMVectorSplatConstant(INT IntConstant, UINT DivExponent);
+XMVECTOR XMVectorSplatConstantInt(INT IntConstant);
+
+// VMX128 versions defined below as macros
+#endif
+
+/****************************************************************************
+ *
+ * Load operations
+ *
+ ****************************************************************************/
+
+XMVECTOR XMLoadInt(_In_ CONST UINT* pSource);
+XMVECTOR XMLoadFloat(_In_ CONST FLOAT* pSource);
+
+XMVECTOR XMLoadInt2(_In_count_c_(2) CONST UINT* pSource);
+XMVECTOR XMLoadInt2A(_In_count_c_(2) CONST UINT* pSource);
+XMVECTOR XMLoadFloat2(_In_ CONST XMFLOAT2* pSource);
+XMVECTOR XMLoadFloat2A(_In_ CONST XMFLOAT2A* pSource);
+XMVECTOR XMLoadSInt2(_In_ CONST XMINT2* pSource);
+XMVECTOR XMLoadUInt2(_In_ CONST XMUINT2* pSource);
+XMVECTOR XMLoadHalf2(_In_ CONST XMHALF2* pSource);
+XMVECTOR XMLoadShortN2(_In_ CONST XMSHORTN2* pSource);
+XMVECTOR XMLoadShort2(_In_ CONST XMSHORT2* pSource);
+XMVECTOR XMLoadUShortN2(_In_ CONST XMUSHORTN2* pSource);
+XMVECTOR XMLoadUShort2(_In_ CONST XMUSHORT2* pSource);
+XMVECTOR XMLoadByteN2(_In_ CONST XMBYTEN2* pSource);
+XMVECTOR XMLoadByte2(_In_ CONST XMBYTE2* pSource);
+XMVECTOR XMLoadUByteN2(_In_ CONST XMUBYTEN2* pSource);
+XMVECTOR XMLoadUByte2(_In_ CONST XMUBYTE2* pSource);
+
+XMVECTOR XMLoadInt3(_In_count_c_(3) CONST UINT* pSource);
+XMVECTOR XMLoadInt3A(_In_count_c_(3) CONST UINT* pSource);
+XMVECTOR XMLoadFloat3(_In_ CONST XMFLOAT3* pSource);
+XMVECTOR XMLoadFloat3A(_In_ CONST XMFLOAT3A* pSource);
+XMVECTOR XMLoadSInt3(_In_ CONST XMINT3* pSource);
+XMVECTOR XMLoadUInt3(_In_ CONST XMUINT3* pSource);
+XMVECTOR XMLoadHenDN3(_In_ CONST XMHENDN3* pSource);
+XMVECTOR XMLoadHenD3(_In_ CONST XMHEND3* pSource);
+XMVECTOR XMLoadUHenDN3(_In_ CONST XMUHENDN3* pSource);
+XMVECTOR XMLoadUHenD3(_In_ CONST XMUHEND3* pSource);
+XMVECTOR XMLoadDHenN3(_In_ CONST XMDHENN3* pSource);
+XMVECTOR XMLoadDHen3(_In_ CONST XMDHEN3* pSource);
+XMVECTOR XMLoadUDHenN3(_In_ CONST XMUDHENN3* pSource);
+XMVECTOR XMLoadUDHen3(_In_ CONST XMUDHEN3* pSource);
+XMVECTOR XMLoadU565(_In_ CONST XMU565* pSource);
+XMVECTOR XMLoadFloat3PK(_In_ CONST XMFLOAT3PK* pSource);
+XMVECTOR XMLoadFloat3SE(_In_ CONST XMFLOAT3SE* pSource);
+
+XMVECTOR XMLoadInt4(_In_count_c_(4) CONST UINT* pSource);
+XMVECTOR XMLoadInt4A(_In_count_c_(4) CONST UINT* pSource);
+XMVECTOR XMLoadFloat4(_In_ CONST XMFLOAT4* pSource);
+XMVECTOR XMLoadFloat4A(_In_ CONST XMFLOAT4A* pSource);
+XMVECTOR XMLoadSInt4(_In_ CONST XMINT4* pSource);
+XMVECTOR XMLoadUInt4(_In_ CONST XMUINT4* pSource);
+XMVECTOR XMLoadHalf4(_In_ CONST XMHALF4* pSource);
+XMVECTOR XMLoadShortN4(_In_ CONST XMSHORTN4* pSource);
+XMVECTOR XMLoadShort4(_In_ CONST XMSHORT4* pSource);
+XMVECTOR XMLoadUShortN4(_In_ CONST XMUSHORTN4* pSource);
+XMVECTOR XMLoadUShort4(_In_ CONST XMUSHORT4* pSource);
+XMVECTOR XMLoadXIcoN4(_In_ CONST XMXICON4* pSource);
+XMVECTOR XMLoadXIco4(_In_ CONST XMXICO4* pSource);
+XMVECTOR XMLoadIcoN4(_In_ CONST XMICON4* pSource);
+XMVECTOR XMLoadIco4(_In_ CONST XMICO4* pSource);
+XMVECTOR XMLoadUIcoN4(_In_ CONST XMUICON4* pSource);
+XMVECTOR XMLoadUIco4(_In_ CONST XMUICO4* pSource);
+XMVECTOR XMLoadXDecN4(_In_ CONST XMXDECN4* pSource);
+XMVECTOR XMLoadXDec4(_In_ CONST XMXDEC4* pSource);
+XMVECTOR XMLoadDecN4(_In_ CONST XMDECN4* pSource);
+XMVECTOR XMLoadDec4(_In_ CONST XMDEC4* pSource);
+XMVECTOR XMLoadUDecN4(_In_ CONST XMUDECN4* pSource);
+XMVECTOR XMLoadUDec4(_In_ CONST XMUDEC4* pSource);
+XMVECTOR XMLoadByteN4(_In_ CONST XMBYTEN4* pSource);
+XMVECTOR XMLoadByte4(_In_ CONST XMBYTE4* pSource);
+XMVECTOR XMLoadUByteN4(_In_ CONST XMUBYTEN4* pSource);
+XMVECTOR XMLoadUByte4(_In_ CONST XMUBYTE4* pSource);
+XMVECTOR XMLoadUNibble4(_In_ CONST XMUNIBBLE4* pSource);
+XMVECTOR XMLoadU555(_In_ CONST XMU555* pSource);
+XMVECTOR XMLoadColor(_In_ CONST XMCOLOR* pSource);
+
+XMMATRIX XMLoadFloat3x3(_In_ CONST XMFLOAT3X3* pSource);
+XMMATRIX XMLoadFloat4x3(_In_ CONST XMFLOAT4X3* pSource);
+XMMATRIX XMLoadFloat4x3A(_In_ CONST XMFLOAT4X3A* pSource);
+XMMATRIX XMLoadFloat4x4(_In_ CONST XMFLOAT4X4* pSource);
+XMMATRIX XMLoadFloat4x4A(_In_ CONST XMFLOAT4X4A* pSource);
+
+/****************************************************************************
+ *
+ * Store operations
+ *
+ ****************************************************************************/
+
+VOID XMStoreInt(_Out_ UINT* pDestination, FXMVECTOR V);
+VOID XMStoreFloat(_Out_ FLOAT* pDestination, FXMVECTOR V);
+
+VOID XMStoreInt2(_Out_cap_c_(2) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreInt2A(_Out_cap_c_(2) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreFloat2(_Out_ XMFLOAT2* pDestination, FXMVECTOR V);
+VOID XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, FXMVECTOR V);
+VOID XMStoreSInt2(_Out_ XMINT2* pDestination, FXMVECTOR V);
+VOID XMStoreUInt2(_Out_ XMUINT2* pDestination, FXMVECTOR V);
+VOID XMStoreHalf2(_Out_ XMHALF2* pDestination, FXMVECTOR V);
+VOID XMStoreShortN2(_Out_ XMSHORTN2* pDestination, FXMVECTOR V);
+VOID XMStoreShort2(_Out_ XMSHORT2* pDestination, FXMVECTOR V);
+VOID XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, FXMVECTOR V);
+VOID XMStoreUShort2(_Out_ XMUSHORT2* pDestination, FXMVECTOR V);
+VOID XMStoreByteN2(_Out_ XMBYTEN2* pDestination, FXMVECTOR V);
+VOID XMStoreByte2(_Out_ XMBYTE2* pDestination, FXMVECTOR V);
+VOID XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, FXMVECTOR V);
+VOID XMStoreUByte2(_Out_ XMUBYTE2* pDestination, FXMVECTOR V);
+
+VOID XMStoreInt3(_Out_cap_c_(3) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreInt3A(_Out_cap_c_(3) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreFloat3(_Out_ XMFLOAT3* pDestination, FXMVECTOR V);
+VOID XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, FXMVECTOR V);
+VOID XMStoreSInt3(_Out_ XMINT3* pDestination, FXMVECTOR V);
+VOID XMStoreUInt3(_Out_ XMUINT3* pDestination, FXMVECTOR V);
+VOID XMStoreHenDN3(_Out_ XMHENDN3* pDestination, FXMVECTOR V);
+VOID XMStoreHenD3(_Out_ XMHEND3* pDestination, FXMVECTOR V);
+VOID XMStoreUHenDN3(_Out_ XMUHENDN3* pDestination, FXMVECTOR V);
+VOID XMStoreUHenD3(_Out_ XMUHEND3* pDestination, FXMVECTOR V);
+VOID XMStoreDHenN3(_Out_ XMDHENN3* pDestination, FXMVECTOR V);
+VOID XMStoreDHen3(_Out_ XMDHEN3* pDestination, FXMVECTOR V);
+VOID XMStoreUDHenN3(_Out_ XMUDHENN3* pDestination, FXMVECTOR V);
+VOID XMStoreUDHen3(_Out_ XMUDHEN3* pDestination, FXMVECTOR V);
+VOID XMStoreU565(_Out_ XMU565* pDestination, FXMVECTOR V);
+VOID XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, FXMVECTOR V);
+VOID XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, FXMVECTOR V);
+
+VOID XMStoreInt4(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreInt4A(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreInt4NC(_Out_cap_c_(4) UINT* pDestination, FXMVECTOR V);
+VOID XMStoreFloat4(_Out_ XMFLOAT4* pDestination, FXMVECTOR V);
+VOID XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, FXMVECTOR V);
+VOID XMStoreFloat4NC(_Out_ XMFLOAT4* pDestination, FXMVECTOR V);
+VOID XMStoreSInt4(_Out_ XMINT4* pDestination, FXMVECTOR V);
+VOID XMStoreUInt4(_Out_ XMUINT4* pDestination, FXMVECTOR V);
+VOID XMStoreHalf4(_Out_ XMHALF4* pDestination, FXMVECTOR V);
+VOID XMStoreShortN4(_Out_ XMSHORTN4* pDestination, FXMVECTOR V);
+VOID XMStoreShort4(_Out_ XMSHORT4* pDestination, FXMVECTOR V);
+VOID XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, FXMVECTOR V);
+VOID XMStoreUShort4(_Out_ XMUSHORT4* pDestination, FXMVECTOR V);
+VOID XMStoreXIcoN4(_Out_ XMXICON4* pDestination, FXMVECTOR V);
+VOID XMStoreXIco4(_Out_ XMXICO4* pDestination, FXMVECTOR V);
+VOID XMStoreIcoN4(_Out_ XMICON4* pDestination, FXMVECTOR V);
+VOID XMStoreIco4(_Out_ XMICO4* pDestination, FXMVECTOR V);
+VOID XMStoreUIcoN4(_Out_ XMUICON4* pDestination, FXMVECTOR V);
+VOID XMStoreUIco4(_Out_ XMUICO4* pDestination, FXMVECTOR V);
+VOID XMStoreXDecN4(_Out_ XMXDECN4* pDestination, FXMVECTOR V);
+VOID XMStoreXDec4(_Out_ XMXDEC4* pDestination, FXMVECTOR V);
+VOID XMStoreDecN4(_Out_ XMDECN4* pDestination, FXMVECTOR V);
+VOID XMStoreDec4(_Out_ XMDEC4* pDestination, FXMVECTOR V);
+VOID XMStoreUDecN4(_Out_ XMUDECN4* pDestination, FXMVECTOR V);
+VOID XMStoreUDec4(_Out_ XMUDEC4* pDestination, FXMVECTOR V);
+VOID XMStoreByteN4(_Out_ XMBYTEN4* pDestination, FXMVECTOR V);
+VOID XMStoreByte4(_Out_ XMBYTE4* pDestination, FXMVECTOR V);
+VOID XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, FXMVECTOR V);
+VOID XMStoreUByte4(_Out_ XMUBYTE4* pDestination, FXMVECTOR V);
+VOID XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, FXMVECTOR V);
+VOID XMStoreU555(_Out_ XMU555* pDestination, FXMVECTOR V);
+VOID XMStoreColor(_Out_ XMCOLOR* pDestination, FXMVECTOR V);
+
+VOID XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, CXMMATRIX M);
+VOID XMStoreFloat3x3NC(_Out_ XMFLOAT3X3* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x3NC(_Out_ XMFLOAT4X3* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, CXMMATRIX M);
+VOID XMStoreFloat4x4NC(_Out_ XMFLOAT4X4* pDestination, CXMMATRIX M);
+
+/****************************************************************************
+ *
+ * General vector operations
+ *
+ ****************************************************************************/
+
+XMVECTOR XMVectorZero();
+XMVECTOR XMVectorSet(FLOAT x, FLOAT y, FLOAT z, FLOAT w);
+XMVECTOR XMVectorSetInt(UINT x, UINT y, UINT z, UINT w);
+XMVECTOR XMVectorReplicate(FLOAT Value);
+XMVECTOR XMVectorReplicatePtr(_In_ CONST FLOAT *pValue);
+XMVECTOR XMVectorReplicateInt(UINT Value);
+XMVECTOR XMVectorReplicateIntPtr(_In_ CONST UINT *pValue);
+XMVECTOR XMVectorTrueInt();
+XMVECTOR XMVectorFalseInt();
+XMVECTOR XMVectorSplatX(FXMVECTOR V);
+XMVECTOR XMVectorSplatY(FXMVECTOR V);
+XMVECTOR XMVectorSplatZ(FXMVECTOR V);
+XMVECTOR XMVectorSplatW(FXMVECTOR V);
+XMVECTOR XMVectorSplatOne();
+XMVECTOR XMVectorSplatInfinity();
+XMVECTOR XMVectorSplatQNaN();
+XMVECTOR XMVectorSplatEpsilon();
+XMVECTOR XMVectorSplatSignMask();
+
+FLOAT XMVectorGetByIndex(FXMVECTOR V,UINT i);
+FLOAT XMVectorGetX(FXMVECTOR V);
+FLOAT XMVectorGetY(FXMVECTOR V);
+FLOAT XMVectorGetZ(FXMVECTOR V);
+FLOAT XMVectorGetW(FXMVECTOR V);
+
+VOID XMVectorGetByIndexPtr(_Out_ FLOAT *f, FXMVECTOR V, UINT i);
+VOID XMVectorGetXPtr(_Out_ FLOAT *x, FXMVECTOR V);
+VOID XMVectorGetYPtr(_Out_ FLOAT *y, FXMVECTOR V);
+VOID XMVectorGetZPtr(_Out_ FLOAT *z, FXMVECTOR V);
+VOID XMVectorGetWPtr(_Out_ FLOAT *w, FXMVECTOR V);
+
+UINT XMVectorGetIntByIndex(FXMVECTOR V,UINT i);
+UINT XMVectorGetIntX(FXMVECTOR V);
+UINT XMVectorGetIntY(FXMVECTOR V);
+UINT XMVectorGetIntZ(FXMVECTOR V);
+UINT XMVectorGetIntW(FXMVECTOR V);
+
+VOID XMVectorGetIntByIndexPtr(_Out_ UINT *x,FXMVECTOR V, UINT i);
+VOID XMVectorGetIntXPtr(_Out_ UINT *x, FXMVECTOR V);
+VOID XMVectorGetIntYPtr(_Out_ UINT *y, FXMVECTOR V);
+VOID XMVectorGetIntZPtr(_Out_ UINT *z, FXMVECTOR V);
+VOID XMVectorGetIntWPtr(_Out_ UINT *w, FXMVECTOR V);
+
+XMVECTOR XMVectorSetByIndex(FXMVECTOR V,FLOAT f,UINT i);
+XMVECTOR XMVectorSetX(FXMVECTOR V, FLOAT x);
+XMVECTOR XMVectorSetY(FXMVECTOR V, FLOAT y);
+XMVECTOR XMVectorSetZ(FXMVECTOR V, FLOAT z);
+XMVECTOR XMVectorSetW(FXMVECTOR V, FLOAT w);
+
+XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, _In_ CONST FLOAT *f, UINT i);
+XMVECTOR XMVectorSetXPtr(FXMVECTOR V, _In_ CONST FLOAT *x);
+XMVECTOR XMVectorSetYPtr(FXMVECTOR V, _In_ CONST FLOAT *y);
+XMVECTOR XMVectorSetZPtr(FXMVECTOR V, _In_ CONST FLOAT *z);
+XMVECTOR XMVectorSetWPtr(FXMVECTOR V, _In_ CONST FLOAT *w);
+
+XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, UINT x,UINT i);
+XMVECTOR XMVectorSetIntX(FXMVECTOR V, UINT x);
+XMVECTOR XMVectorSetIntY(FXMVECTOR V, UINT y);
+XMVECTOR XMVectorSetIntZ(FXMVECTOR V, UINT z);
+XMVECTOR XMVectorSetIntW(FXMVECTOR V, UINT w);
+
+XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, _In_ CONST UINT *x, UINT i);
+XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, _In_ CONST UINT *x);
+XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, _In_ CONST UINT *y);
+XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, _In_ CONST UINT *z);
+XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, _In_ CONST UINT *w);
+
+XMVECTOR XMVectorPermuteControl(UINT ElementIndex0, UINT ElementIndex1, UINT ElementIndex2, UINT ElementIndex3);
+XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control);
+XMVECTOR XMVectorSelectControl(UINT VectorIndex0, UINT VectorIndex1, UINT VectorIndex2, UINT VectorIndex3);
+XMVECTOR XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control);
+XMVECTOR XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2);
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, UINT Elements);
+XMVECTOR XMVectorRotateLeft(FXMVECTOR V, UINT Elements);
+XMVECTOR XMVectorRotateRight(FXMVECTOR V, UINT Elements);
+XMVECTOR XMVectorSwizzle(FXMVECTOR V, UINT E0, UINT E1, UINT E2, UINT E3);
+XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateElements,
+ UINT Select0, UINT Select1, UINT Select2, UINT Select3);
+
+// VMX128 versions defined below as macros
+#endif
+
+XMVECTOR XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorEqualR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorEqualIntR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
+XMVECTOR XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorGreaterR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorGreaterOrEqualR(_Out_ UINT* pCR, FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorLess(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds);
+XMVECTOR XMVectorInBoundsR(_Out_ UINT* pCR, FXMVECTOR V, FXMVECTOR Bounds);
+
+XMVECTOR XMVectorIsNaN(FXMVECTOR V);
+XMVECTOR XMVectorIsInfinite(FXMVECTOR V);
+
+XMVECTOR XMVectorMin(FXMVECTOR V1,FXMVECTOR V2);
+XMVECTOR XMVectorMax(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorRound(FXMVECTOR V);
+XMVECTOR XMVectorTruncate(FXMVECTOR V);
+XMVECTOR XMVectorFloor(FXMVECTOR V);
+XMVECTOR XMVectorCeiling(FXMVECTOR V);
+XMVECTOR XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max);
+XMVECTOR XMVectorSaturate(FXMVECTOR V);
+
+XMVECTOR XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2);
+
+XMVECTOR XMVectorNegate(FXMVECTOR V);
+XMVECTOR XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
+XMVECTOR XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
+XMVECTOR XMVectorScale(FXMVECTOR V, FLOAT ScaleFactor);
+XMVECTOR XMVectorReciprocalEst(FXMVECTOR V);
+XMVECTOR XMVectorReciprocal(FXMVECTOR V);
+XMVECTOR XMVectorSqrtEst(FXMVECTOR V);
+XMVECTOR XMVectorSqrt(FXMVECTOR V);
+XMVECTOR XMVectorReciprocalSqrtEst(FXMVECTOR V);
+XMVECTOR XMVectorReciprocalSqrt(FXMVECTOR V);
+XMVECTOR XMVectorExpEst(FXMVECTOR V);
+XMVECTOR XMVectorExp(FXMVECTOR V);
+XMVECTOR XMVectorLogEst(FXMVECTOR V);
+XMVECTOR XMVectorLog(FXMVECTOR V);
+XMVECTOR XMVectorPowEst(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorPow(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorAbs(FXMVECTOR V);
+XMVECTOR XMVectorMod(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVectorModAngles(FXMVECTOR Angles);
+XMVECTOR XMVectorSin(FXMVECTOR V);
+XMVECTOR XMVectorSinEst(FXMVECTOR V);
+XMVECTOR XMVectorCos(FXMVECTOR V);
+XMVECTOR XMVectorCosEst(FXMVECTOR V);
+VOID XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, FXMVECTOR V);
+VOID XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, FXMVECTOR V);
+XMVECTOR XMVectorTan(FXMVECTOR V);
+XMVECTOR XMVectorTanEst(FXMVECTOR V);
+XMVECTOR XMVectorSinH(FXMVECTOR V);
+XMVECTOR XMVectorSinHEst(FXMVECTOR V);
+XMVECTOR XMVectorCosH(FXMVECTOR V);
+XMVECTOR XMVectorCosHEst(FXMVECTOR V);
+XMVECTOR XMVectorTanH(FXMVECTOR V);
+XMVECTOR XMVectorTanHEst(FXMVECTOR V);
+XMVECTOR XMVectorASin(FXMVECTOR V);
+XMVECTOR XMVectorASinEst(FXMVECTOR V);
+XMVECTOR XMVectorACos(FXMVECTOR V);
+XMVECTOR XMVectorACosEst(FXMVECTOR V);
+XMVECTOR XMVectorATan(FXMVECTOR V);
+XMVECTOR XMVectorATanEst(FXMVECTOR V);
+XMVECTOR XMVectorATan2(FXMVECTOR Y, FXMVECTOR X);
+XMVECTOR XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X);
+XMVECTOR XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, FLOAT t);
+XMVECTOR XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T);
+XMVECTOR XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, CXMVECTOR Tangent1, FLOAT t);
+XMVECTOR XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, CXMVECTOR Tangent1, CXMVECTOR T);
+XMVECTOR XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR Position3, FLOAT t);
+XMVECTOR XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR Position3, CXMVECTOR T);
+XMVECTOR XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, FLOAT f, FLOAT g);
+XMVECTOR XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, CXMVECTOR F, CXMVECTOR G);
+
+/****************************************************************************
+ *
+ * 2D vector operations
+ *
+ ****************************************************************************/
+
+
+BOOL XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
+BOOL XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2Less(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds);
+UINT XMVector2InBoundsR(FXMVECTOR V, FXMVECTOR Bounds);
+
+BOOL XMVector2IsNaN(FXMVECTOR V);
+BOOL XMVector2IsInfinite(FXMVECTOR V);
+
+XMVECTOR XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector2LengthSq(FXMVECTOR V);
+XMVECTOR XMVector2ReciprocalLengthEst(FXMVECTOR V);
+XMVECTOR XMVector2ReciprocalLength(FXMVECTOR V);
+XMVECTOR XMVector2LengthEst(FXMVECTOR V);
+XMVECTOR XMVector2Length(FXMVECTOR V);
+XMVECTOR XMVector2NormalizeEst(FXMVECTOR V);
+XMVECTOR XMVector2Normalize(FXMVECTOR V);
+XMVECTOR XMVector2ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax);
+XMVECTOR XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
+XMVECTOR XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
+XMVECTOR XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex);
+XMVECTOR XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
+XMVECTOR XMVector2Orthogonal(FXMVECTOR V);
+XMVECTOR XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
+XMVECTOR XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, CXMVECTOR Line2Point2);
+XMVECTOR XMVector2Transform(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT4* XMVector2TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMFLOAT4* XMVector2TransformStreamNC(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMVECTOR XMVector2TransformCoord(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT2* XMVector2TransformCoordStream(_Out_bytecap_x_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMVECTOR XMVector2TransformNormal(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT2* XMVector2TransformNormalStream(_Out_bytecap_x_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) CONST XMFLOAT2* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+
+/****************************************************************************
+ *
+ * 3D vector operations
+ *
+ ****************************************************************************/
+
+
+BOOL XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
+BOOL XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3Less(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds);
+UINT XMVector3InBoundsR(FXMVECTOR V, FXMVECTOR Bounds);
+
+BOOL XMVector3IsNaN(FXMVECTOR V);
+BOOL XMVector3IsInfinite(FXMVECTOR V);
+
+XMVECTOR XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector3LengthSq(FXMVECTOR V);
+XMVECTOR XMVector3ReciprocalLengthEst(FXMVECTOR V);
+XMVECTOR XMVector3ReciprocalLength(FXMVECTOR V);
+XMVECTOR XMVector3LengthEst(FXMVECTOR V);
+XMVECTOR XMVector3Length(FXMVECTOR V);
+XMVECTOR XMVector3NormalizeEst(FXMVECTOR V);
+XMVECTOR XMVector3Normalize(FXMVECTOR V);
+XMVECTOR XMVector3ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax);
+XMVECTOR XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
+XMVECTOR XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
+XMVECTOR XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex);
+XMVECTOR XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
+XMVECTOR XMVector3Orthogonal(FXMVECTOR V);
+XMVECTOR XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
+VOID XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, FXMVECTOR V, FXMVECTOR Normal);
+XMVECTOR XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
+XMVECTOR XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
+XMVECTOR XMVector3Transform(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT4* XMVector3TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMFLOAT4* XMVector3TransformStreamNC(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMVECTOR XMVector3TransformCoord(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT3* XMVector3TransformCoordStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMVECTOR XMVector3TransformNormal(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT3* XMVector3TransformNormalStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+XMVECTOR XMVector3Project(FXMVECTOR V, FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ,
+ CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
+XMFLOAT3* XMVector3ProjectStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount,
+ FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ,
+ CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
+XMVECTOR XMVector3Unproject(FXMVECTOR V, FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ,
+ CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
+XMFLOAT3* XMVector3UnprojectStream(_Out_bytecap_x_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) CONST XMFLOAT3* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount,
+ FLOAT ViewportX, FLOAT ViewportY, FLOAT ViewportWidth, FLOAT ViewportHeight, FLOAT ViewportMinZ, FLOAT ViewportMaxZ,
+ CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
+
+/****************************************************************************
+ *
+ * 4D vector operations
+ *
+ ****************************************************************************/
+
+BOOL XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
+BOOL XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+UINT XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4Less(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
+BOOL XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds);
+UINT XMVector4InBoundsR(FXMVECTOR V, FXMVECTOR Bounds);
+
+BOOL XMVector4IsNaN(FXMVECTOR V);
+BOOL XMVector4IsInfinite(FXMVECTOR V);
+
+XMVECTOR XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
+XMVECTOR XMVector4LengthSq(FXMVECTOR V);
+XMVECTOR XMVector4ReciprocalLengthEst(FXMVECTOR V);
+XMVECTOR XMVector4ReciprocalLength(FXMVECTOR V);
+XMVECTOR XMVector4LengthEst(FXMVECTOR V);
+XMVECTOR XMVector4Length(FXMVECTOR V);
+XMVECTOR XMVector4NormalizeEst(FXMVECTOR V);
+XMVECTOR XMVector4Normalize(FXMVECTOR V);
+XMVECTOR XMVector4ClampLength(FXMVECTOR V, FLOAT LengthMin, FLOAT LengthMax);
+XMVECTOR XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
+XMVECTOR XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
+XMVECTOR XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, FLOAT RefractionIndex);
+XMVECTOR XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
+XMVECTOR XMVector4Orthogonal(FXMVECTOR V);
+XMVECTOR XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
+XMVECTOR XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
+XMVECTOR XMVector4Transform(FXMVECTOR V, CXMMATRIX M);
+XMFLOAT4* XMVector4TransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) CONST XMFLOAT4* pInputStream,
+ _In_ size_t InputStride, _In_ size_t VectorCount, CXMMATRIX M);
+
+/****************************************************************************
+ *
+ * Matrix operations
+ *
+ ****************************************************************************/
+
+BOOL XMMatrixIsNaN(CXMMATRIX M);
+BOOL XMMatrixIsInfinite(CXMMATRIX M);
+BOOL XMMatrixIsIdentity(CXMMATRIX M);
+
+XMMATRIX XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2);
+XMMATRIX XMMatrixMultiplyTranspose(CXMMATRIX M1, CXMMATRIX M2);
+XMMATRIX XMMatrixTranspose(CXMMATRIX M);
+XMMATRIX XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, CXMMATRIX M);
+XMVECTOR XMMatrixDeterminant(CXMMATRIX M);
+BOOL XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, CXMMATRIX M);
+
+XMMATRIX XMMatrixIdentity();
+XMMATRIX XMMatrixSet(FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33);
+XMMATRIX XMMatrixTranslation(FLOAT OffsetX, FLOAT OffsetY, FLOAT OffsetZ);
+XMMATRIX XMMatrixTranslationFromVector(FXMVECTOR Offset);
+XMMATRIX XMMatrixScaling(FLOAT ScaleX, FLOAT ScaleY, FLOAT ScaleZ);
+XMMATRIX XMMatrixScalingFromVector(FXMVECTOR Scale);
+XMMATRIX XMMatrixRotationX(FLOAT Angle);
+XMMATRIX XMMatrixRotationY(FLOAT Angle);
+XMMATRIX XMMatrixRotationZ(FLOAT Angle);
+XMMATRIX XMMatrixRotationRollPitchYaw(FLOAT Pitch, FLOAT Yaw, FLOAT Roll);
+XMMATRIX XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles);
+XMMATRIX XMMatrixRotationNormal(FXMVECTOR NormalAxis, FLOAT Angle);
+XMMATRIX XMMatrixRotationAxis(FXMVECTOR Axis, FLOAT Angle);
+XMMATRIX XMMatrixRotationQuaternion(FXMVECTOR Quaternion);
+XMMATRIX XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, FLOAT ScalingOrientation, FXMVECTOR Scaling,
+ FXMVECTOR RotationOrigin, FLOAT Rotation, CXMVECTOR Translation);
+XMMATRIX XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling,
+ CXMVECTOR RotationOrigin, CXMVECTOR RotationQuaternion, CXMVECTOR Translation);
+XMMATRIX XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FLOAT Rotation, FXMVECTOR Translation);
+XMMATRIX XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, CXMVECTOR Translation);
+XMMATRIX XMMatrixReflect(FXMVECTOR ReflectionPlane);
+XMMATRIX XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition);
+
+XMMATRIX XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
+XMMATRIX XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
+XMMATRIX XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
+XMMATRIX XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
+XMMATRIX XMMatrixPerspectiveLH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixPerspectiveRH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixPerspectiveFovLH(FLOAT FovAngleY, FLOAT AspectHByW, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixPerspectiveFovRH(FLOAT FovAngleY, FLOAT AspectHByW, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixPerspectiveOffCenterLH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixPerspectiveOffCenterRH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixOrthographicLH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixOrthographicRH(FLOAT ViewWidth, FLOAT ViewHeight, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixOrthographicOffCenterLH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ);
+XMMATRIX XMMatrixOrthographicOffCenterRH(FLOAT ViewLeft, FLOAT ViewRight, FLOAT ViewBottom, FLOAT ViewTop, FLOAT NearZ, FLOAT FarZ);
+
+
+/****************************************************************************
+ *
+ * Quaternion operations
+ *
+ ****************************************************************************/
+
+BOOL XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2);
+BOOL XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2);
+
+BOOL XMQuaternionIsNaN(FXMVECTOR Q);
+BOOL XMQuaternionIsInfinite(FXMVECTOR Q);
+BOOL XMQuaternionIsIdentity(FXMVECTOR Q);
+
+XMVECTOR XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2);
+XMVECTOR XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2);
+XMVECTOR XMQuaternionLengthSq(FXMVECTOR Q);
+XMVECTOR XMQuaternionReciprocalLength(FXMVECTOR Q);
+XMVECTOR XMQuaternionLength(FXMVECTOR Q);
+XMVECTOR XMQuaternionNormalizeEst(FXMVECTOR Q);
+XMVECTOR XMQuaternionNormalize(FXMVECTOR Q);
+XMVECTOR XMQuaternionConjugate(FXMVECTOR Q);
+XMVECTOR XMQuaternionInverse(FXMVECTOR Q);
+XMVECTOR XMQuaternionLn(FXMVECTOR Q);
+XMVECTOR XMQuaternionExp(FXMVECTOR Q);
+XMVECTOR XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, FLOAT t);
+XMVECTOR XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T);
+XMVECTOR XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3, FLOAT t);
+XMVECTOR XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3, CXMVECTOR T);
+VOID XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR Q3);
+XMVECTOR XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, FLOAT f, FLOAT g);
+XMVECTOR XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, CXMVECTOR F, CXMVECTOR G);
+
+XMVECTOR XMQuaternionIdentity();
+XMVECTOR XMQuaternionRotationRollPitchYaw(FLOAT Pitch, FLOAT Yaw, FLOAT Roll);
+XMVECTOR XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles);
+XMVECTOR XMQuaternionRotationNormal(FXMVECTOR NormalAxis, FLOAT Angle);
+XMVECTOR XMQuaternionRotationAxis(FXMVECTOR Axis, FLOAT Angle);
+XMVECTOR XMQuaternionRotationMatrix(CXMMATRIX M);
+
+VOID XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ FLOAT* pAngle, FXMVECTOR Q);
+
+/****************************************************************************
+ *
+ * Plane operations
+ *
+ ****************************************************************************/
+
+BOOL XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2);
+BOOL XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon);
+BOOL XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2);
+
+BOOL XMPlaneIsNaN(FXMVECTOR P);
+BOOL XMPlaneIsInfinite(FXMVECTOR P);
+
+XMVECTOR XMPlaneDot(FXMVECTOR P, FXMVECTOR V);
+XMVECTOR XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V);
+XMVECTOR XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V);
+XMVECTOR XMPlaneNormalizeEst(FXMVECTOR P);
+XMVECTOR XMPlaneNormalize(FXMVECTOR P);
+XMVECTOR XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2);
+VOID XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, FXMVECTOR P1, FXMVECTOR P2);
+XMVECTOR XMPlaneTransform(FXMVECTOR P, CXMMATRIX M);
+XMFLOAT4* XMPlaneTransformStream(_Out_bytecap_x_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream,
+ _In_ size_t OutputStride,
+ _In_bytecount_x_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) CONST XMFLOAT4* pInputStream,
+ _In_ size_t InputStride, _In_ size_t PlaneCount, CXMMATRIX M);
+
+XMVECTOR XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal);
+XMVECTOR XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3);
+
+/****************************************************************************
+ *
+ * Color operations
+ *
+ ****************************************************************************/
+
+BOOL XMColorEqual(FXMVECTOR C1, FXMVECTOR C2);
+BOOL XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2);
+BOOL XMColorGreater(FXMVECTOR C1, FXMVECTOR C2);
+BOOL XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2);
+BOOL XMColorLess(FXMVECTOR C1, FXMVECTOR C2);
+BOOL XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2);
+
+BOOL XMColorIsNaN(FXMVECTOR C);
+BOOL XMColorIsInfinite(FXMVECTOR C);
+
+XMVECTOR XMColorNegative(FXMVECTOR C);
+XMVECTOR XMColorModulate(FXMVECTOR C1, FXMVECTOR C2);
+XMVECTOR XMColorAdjustSaturation(FXMVECTOR C, FLOAT Saturation);
+XMVECTOR XMColorAdjustContrast(FXMVECTOR C, FLOAT Contrast);
+
+/****************************************************************************
+ *
+ * Miscellaneous operations
+ *
+ ****************************************************************************/
+
+BOOL XMVerifyCPUSupport();
+
+VOID XMAssert(_In_z_ CONST CHAR* pExpression, _In_z_ CONST CHAR* pFileName, UINT LineNumber);
+
+XMVECTOR XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex);
+
+BOOL XMScalarNearEqual(FLOAT S1, FLOAT S2, FLOAT Epsilon);
+FLOAT XMScalarModAngle(FLOAT Value);
+FLOAT XMScalarSin(FLOAT Value);
+FLOAT XMScalarCos(FLOAT Value);
+VOID XMScalarSinCos(_Out_ FLOAT* pSin, _Out_ FLOAT* pCos, FLOAT Value);
+FLOAT XMScalarASin(FLOAT Value);
+FLOAT XMScalarACos(FLOAT Value);
+FLOAT XMScalarSinEst(FLOAT Value);
+FLOAT XMScalarCosEst(FLOAT Value);
+VOID XMScalarSinCosEst(_Out_ FLOAT* pSin, _Out_ FLOAT* pCos, FLOAT Value);
+FLOAT XMScalarASinEst(FLOAT Value);
+FLOAT XMScalarACosEst(FLOAT Value);
+
+/****************************************************************************
+ *
+ * Templates
+ *
+ ****************************************************************************/
+
+#if defined(__cplusplus)
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+// PermuteHelper internal template (SSE only)
+namespace XNAMathInternal
+{
+ // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
+ template<UINT Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+ {
+ static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2)
+ {
+ static const XMVECTORU32 selectMask =
+ {
+ WhichX ? 0xFFFFFFFF : 0,
+ WhichY ? 0xFFFFFFFF : 0,
+ WhichZ ? 0xFFFFFFFF : 0,
+ WhichW ? 0xFFFFFFFF : 0,
+ };
+
+ XMVECTOR shuffled1 = _mm_shuffle_ps(v1, v1, Shuffle);
+ XMVECTOR shuffled2 = _mm_shuffle_ps(v2, v2, Shuffle);
+
+ XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+ XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+ return _mm_or_ps(masked1, masked2);
+ }
+ };
+
+ // Fast path for permutes that only read from the first vector.
+ template<UINT Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+ {
+ static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_shuffle_ps(v1, v1, Shuffle); }
+ };
+
+ // Fast path for permutes that only read from the second vector.
+ template<UINT Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+ {
+ static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_shuffle_ps(v2, v2, Shuffle); }
+ };
+
+ // Fast path for permutes that read XY from the first vector, ZW from the second.
+ template<UINT Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+ {
+ static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+ };
+
+ // Fast path for permutes that read XY from the second vector, ZW from the first.
+ template<UINT Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+ {
+ static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+ };
+};
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+// General permute template
+template<UINT PermuteX, UINT PermuteY, UINT PermuteZ, UINT PermuteW>
+ inline XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+ static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+ static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+ static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+ static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+#else
+ XMASSERT(PermuteX <= 7);
+ XMASSERT(PermuteY <= 7);
+ XMASSERT(PermuteZ <= 7);
+ XMASSERT(PermuteW <= 7);
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ const UINT Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+ const bool WhichX = PermuteX > 3;
+ const bool WhichY = PermuteY > 3;
+ const bool WhichZ = PermuteZ > 3;
+ const bool WhichW = PermuteW > 3;
+
+ return XNAMathInternal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+#else
+
+ XMVECTOR c = XMVectorPermuteControl( PermuteX, PermuteY, PermuteZ, PermuteW );
+ return XMVectorPermute( V1, V2, c );
+
+#endif
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
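+// Usage sketch (illustrative only, not part of the upstream header): template
+// indices 0-3 select components of V1 and 4-7 select components of V2, so
+// <0,4,1,5> interleaves the X and Y components of the two vectors:
+//
+//     XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
+//     XMVECTOR b = XMVectorSet(5.0f, 6.0f, 7.0f, 8.0f);
+//     XMVECTOR r = XMVectorPermute<0, 4, 1, 5>(a, b); // {1, 5, 2, 6}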
+
+//------------------------------------------------------------------------------
+
+// General swizzle template
+template<UINT SwizzleX, UINT SwizzleY, UINT SwizzleZ, UINT SwizzleW>
+ inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+ static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+ static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+ static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+ static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+#else
+ XMASSERT(SwizzleX <= 3);
+ XMASSERT(SwizzleY <= 3);
+ XMASSERT(SwizzleZ <= 3);
+ XMASSERT(SwizzleW <= 3);
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ return _mm_shuffle_ps( V, V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
+#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ return __vpermwi(V, ((SwizzleX & 3) << 6) | ((SwizzleY & 3) << 4) | ((SwizzleZ & 3) << 2) | (SwizzleW & 3) );
+#else
+
+ return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
+
+#endif
+}
+
+// Specialized swizzles
+template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
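+// Usage sketch (illustrative only): swizzle indices address components of the
+// single input vector, so <3,2,1,0> reverses the component order:
+//
+//     XMVECTOR v = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
+//     XMVECTOR r = XMVectorSwizzle<3, 2, 1, 0>(v); // {4, 3, 2, 1}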
+
+//------------------------------------------------------------------------------
+
+template<UINT Elements>
+ inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+ static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+ XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+ return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+#endif
+}
+
+template<UINT Elements>
+ inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+ static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+ XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+ return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+#endif
+}
+
+template<UINT Elements>
+ inline XMVECTOR XMVectorRotateRight(FXMVECTOR V)
+{
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+ static_assert( Elements < 4, "Elements template parameter out of range" );
+#else
+ XMASSERT( Elements < 4 );
+#endif
+
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+ return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+#endif
+}
+
+template<UINT VSLeftRotateElements, UINT Select0, UINT Select1, UINT Select2, UINT Select3>
+ inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS)
+{
+#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#else
+ XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
+ return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control );
+#endif
+}
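+// Usage sketch for the shift/rotate/insert templates above (illustrative
+// only, not part of the upstream header): with V1 = {1, 2, 3, 4} and
+// V2 = {5, 6, 7, 8},
+//
+//     XMVectorRotateLeft<1>(V1)             // {2, 3, 4, 1}
+//     XMVectorShiftLeft<1>(V1, V2)          // {2, 3, 4, 5}
+//     XMVectorInsert<0, 0, 1, 0, 0>(V1, V2) // {1, 6, 3, 4}; a 1 select bit takes from V2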
+
+#endif // __cplusplus
+
+/****************************************************************************
+ *
+ * Globals
+ *
+ ****************************************************************************/
+
+// The purpose of the following global constants is to prevent redundant
+// reloading of the constants when they are referenced by more than one
+// separate inline math routine called within the same function. Declaring
+// a constant locally within a routine is sufficient to prevent redundant
+// reloads of that constant when that single routine is called multiple
+// times in a function, but if the constant is used (and declared) in a
+// separate math routine, it would be reloaded.
+
+#define XMGLOBALCONST extern CONST __declspec(selectany)
+
+XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f};
+XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f};
+XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients2 = {2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f};
+XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f};
+XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f};
+XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients2 = {4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f};
+XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f};
+XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f};
+XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f};
+XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients0 = {-0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f};
+XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients1 = {0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f};
+XMGLOBALCONST XMVECTORF32 g_XMASinCoefficients2 = {-1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f};
+XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {1.0f, 0.333333334f, 0.2f, 0.142857143f};
+XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f};
+XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients2 = {5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f};
+XMGLOBALCONST XMVECTORF32 g_XMSinEstCoefficients = {1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f};
+XMGLOBALCONST XMVECTORF32 g_XMCosEstCoefficients = {1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f};
+XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI};
+XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients = {7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, XM_PIDIV2};
+XMGLOBALCONST XMVECTORF32 g_XMASinEstCoefficients = {-1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f};
+XMGLOBALCONST XMVECTORF32 g_XMASinEstConstants = {1.00000011921f, XM_PIDIV2, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI};
+XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f};
+XMGLOBALCONST XMVECTORI32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+XMGLOBALCONST XMVECTORI32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF};
+XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f};
+XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f};
+XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI};
+XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI};
+XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2};
+XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI};
+XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI};
+XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI};
+XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI};
+XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f};
+XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
+XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000};
+XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
+XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000};
+XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF};
+XMGLOBALCONST XMVECTORI32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
+XMGLOBALCONST XMVECTORI32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000};
+XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)};
+XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))};
+XMGLOBALCONST XMVECTORI32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000};
+XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)};
+XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))};
+XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000};
+XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f};
+XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000};
+XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {-32768.0f,-32768.0f,0.0f,0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)};
+XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f};
+XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF};
+XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f};
+XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f};
+XMGLOBALCONST XMVECTORI32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
+XMGLOBALCONST XMVECTORI32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0};
+XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD};
+XMGLOBALCONST XMVECTORI32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0};
+XMGLOBALCONST XMVECTORI32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0};
+XMGLOBALCONST XMVECTORI32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0};
+XMGLOBALCONST XMVECTORI32 g_XMSwizzleXYXY = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0Y};
+XMGLOBALCONST XMVECTORI32 g_XMSwizzleXYZX = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X};
+XMGLOBALCONST XMVECTORI32 g_XMSwizzleYXZW = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+XMGLOBALCONST XMVECTORI32 g_XMSwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W};
+XMGLOBALCONST XMVECTORI32 g_XMSwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
+XMGLOBALCONST XMVECTORI32 g_XMPermute0X0Y1X1Y = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y};
+XMGLOBALCONST XMVECTORI32 g_XMPermute0Z0W1Z1W = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_1W};
+XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f};
+XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f};
+XMGLOBALCONST XMVECTORI32 g_XMFlipY = {0,0x80000000,0,0};
+XMGLOBALCONST XMVECTORI32 g_XMFlipZ = {0,0,0x80000000,0};
+XMGLOBALCONST XMVECTORI32 g_XMFlipW = {0,0,0,0x80000000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipYZ = {0,0x80000000,0x80000000,0};
+XMGLOBALCONST XMVECTORI32 g_XMFlipZW = {0,0,0x80000000,0x80000000};
+XMGLOBALCONST XMVECTORI32 g_XMFlipYW = {0,0x80000000,0,0x80000000};
+XMGLOBALCONST XMVECTORI32 g_XMMaskHenD3 = {0x7FF,0x7ff<<11,0x3FF<<22,0};
+XMGLOBALCONST XMVECTORI32 g_XMMaskDHen3 = {0x3FF,0x7ff<<10,0x7FF<<21,0};
+XMGLOBALCONST XMVECTORF32 g_XMAddUHenD3 = {0,0,32768.0f*65536.0f,0};
+XMGLOBALCONST XMVECTORF32 g_XMAddHenD3 = {-1024.0f,-1024.0f*2048.0f,0,0};
+XMGLOBALCONST XMVECTORF32 g_XMAddDHen3 = {-512.0f,-1024.0f*1024.0f,0,0};
+XMGLOBALCONST XMVECTORF32 g_XMMulHenD3 = {1.0f,1.0f/2048.0f,1.0f/(2048.0f*2048.0f),0};
+XMGLOBALCONST XMVECTORF32 g_XMMulDHen3 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*2048.0f),0};
+XMGLOBALCONST XMVECTORI32 g_XMXorHenD3 = {0x400,0x400<<11,0,0};
+XMGLOBALCONST XMVECTORI32 g_XMXorDHen3 = {0x200,0x400<<10,0,0};
+XMGLOBALCONST XMVECTORI32 g_XMMaskIco4 = {0xFFFFF,0xFFFFF000,0xFFFFF,0xF0000000};
+XMGLOBALCONST XMVECTORI32 g_XMXorXIco4 = {0x80000,0,0x80000,0x80000000};
+XMGLOBALCONST XMVECTORI32 g_XMXorIco4 = {0x80000,0,0x80000,0};
+XMGLOBALCONST XMVECTORF32 g_XMAddXIco4 = {-8.0f*65536.0f,0,-8.0f*65536.0f,32768.0f*65536.0f};
+XMGLOBALCONST XMVECTORF32 g_XMAddUIco4 = {0,32768.0f*65536.0f,0,32768.0f*65536.0f};
+XMGLOBALCONST XMVECTORF32 g_XMAddIco4 = {-8.0f*65536.0f,0,-8.0f*65536.0f,0};
+XMGLOBALCONST XMVECTORF32 g_XMMulIco4 = {1.0f,1.0f/4096.0f,1.0f,1.0f/(4096.0f*65536.0f)};
+XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
+XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0};
+XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f};
+XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0};
+XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+XMGLOBALCONST XMVECTORI32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000};
+XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000};
+XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0};
+XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
+XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
+XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
+XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
+
+
+/****************************************************************************
+ *
+ * Implementation
+ *
+ ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable:4068 4214 4204 4365 4616 6001)
+
+#pragma prefast(push)
+#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
+
+#if !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_)
+
+/* Work around VC 2005 bug where math.h defines logf with a semicolon at the end.
+ * Note this is fixed as of Visual Studio 2005 Service Pack 1
+ */
+
+#undef logf
+#define logf(x) ((float)log((double)(x)))
+
+#endif // !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_)
+
+
+//------------------------------------------------------------------------------
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_)
+#else
+
+XMFINLINE XMVECTOR XMVectorSetBinaryConstant(UINT C0, UINT C1, UINT C2, UINT C3)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORU32 vResult;
+ vResult.u[0] = (0-(C0&1)) & 0x3F800000;
+ vResult.u[1] = (0-(C1&1)) & 0x3F800000;
+ vResult.u[2] = (0-(C2&1)) & 0x3F800000;
+ vResult.u[3] = (0-(C3&1)) & 0x3F800000;
+ return vResult.v;
+#else // XM_SSE_INTRINSICS_
+ static const XMVECTORU32 g_vMask1 = {1,1,1,1};
+ // Move the parms to a vector
+ __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0);
+ // Mask off the low bits
+ vTemp = _mm_and_si128(vTemp,g_vMask1);
+ // 0xFFFFFFFF on true bits
+ vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
+ // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
+ vTemp = _mm_and_si128(vTemp,g_XMOne);
+ return reinterpret_cast<const __m128 *>(&vTemp)[0];
+#endif
+}
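+// Example (illustrative only): each argument contributes 1.0f when its low
+// bit is set and 0.0f otherwise:
+//
+//     XMVECTOR v = XMVectorSetBinaryConstant(1, 0, 1, 0); // {1.0f, 0.0f, 1.0f, 0.0f}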
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSplatConstant(INT IntConstant, UINT DivExponent)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+ XMASSERT(DivExponent<32);
+ {
+ XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
+ return XMConvertVectorIntToFloat( V.v, DivExponent);
+ }
+#else // XM_SSE_INTRINSICS_
+ XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+ XMASSERT(DivExponent<32);
+ // Splat the int
+ __m128i vScale = _mm_set1_epi32(IntConstant);
+ // Convert to a float
+ XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+ // Convert DivExponent into 1.0f/(1<<DivExponent)
+ UINT uScale = 0x3F800000U - (DivExponent << 23);
+ // Splat the scalar value (It's really a float)
+ vScale = _mm_set1_epi32(uScale);
+ // Multiply by the reciprocal (Perform a right shift by DivExponent)
+ vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
+ return vResult;
+#endif
+}
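+// Example (illustrative only): the result is IntConstant / (1 << DivExponent)
+// replicated to all four components:
+//
+//     XMVECTOR v = XMVectorSplatConstant(3, 2); // {0.75f, 0.75f, 0.75f, 0.75f}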
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSplatConstantInt(INT IntConstant)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+ {
+ XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
+ return V.v;
+ }
+#else // XM_SSE_INTRINSICS_
+ XMASSERT( IntConstant >= -16 && IntConstant <= 15 );
+ __m128i V = _mm_set1_epi32( IntConstant );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, UINT Elements)
+{
+ return XMVectorPermute(V1, V2, XMVectorPermuteControl((Elements), ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorRotateLeft(FXMVECTOR V, UINT Elements)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT( Elements < 4 );
+ {
+ XMVECTORF32 vResult = { V.vector4_f32[Elements & 3], V.vector4_f32[(Elements + 1) & 3],
+ V.vector4_f32[(Elements + 2) & 3], V.vector4_f32[(Elements + 3) & 3] };
+ return vResult.v;
+ }
+#else // XM_SSE_INTRINSICS_
+ FLOAT fx = XMVectorGetByIndex(V,(Elements) & 3);
+ FLOAT fy = XMVectorGetByIndex(V,((Elements) + 1) & 3);
+ FLOAT fz = XMVectorGetByIndex(V,((Elements) + 2) & 3);
+ FLOAT fw = XMVectorGetByIndex(V,((Elements) + 3) & 3);
+ return _mm_set_ps( fw, fz, fy, fx );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorRotateRight(FXMVECTOR V, UINT Elements)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT( Elements < 4 );
+ {
+ XMVECTORF32 vResult = { V.vector4_f32[(4 - (Elements)) & 3], V.vector4_f32[(5 - (Elements)) & 3],
+ V.vector4_f32[(6 - (Elements)) & 3], V.vector4_f32[(7 - (Elements)) & 3] };
+ return vResult.v;
+ }
+#else // XM_SSE_INTRINSICS_
+ FLOAT fx = XMVectorGetByIndex(V,(4 - (Elements)) & 3);
+ FLOAT fy = XMVectorGetByIndex(V,(5 - (Elements)) & 3);
+ FLOAT fz = XMVectorGetByIndex(V,(6 - (Elements)) & 3);
+ FLOAT fw = XMVectorGetByIndex(V,(7 - (Elements)) & 3);
+ return _mm_set_ps( fw, fz, fy, fx );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSwizzle(FXMVECTOR V, UINT E0, UINT E1, UINT E2, UINT E3)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+ {
+ XMVECTORF32 vResult = { V.vector4_f32[E0], V.vector4_f32[E1], V.vector4_f32[E2], V.vector4_f32[E3] };
+ return vResult.v;
+ }
+#else // XM_SSE_INTRINSICS_
+ FLOAT fx = XMVectorGetByIndex(V,E0);
+ FLOAT fy = XMVectorGetByIndex(V,E1);
+ FLOAT fz = XMVectorGetByIndex(V,E2);
+ FLOAT fw = XMVectorGetByIndex(V,E3);
+ return _mm_set_ps( fw, fz, fy, fx );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateElements,
+ UINT Select0, UINT Select1, UINT Select2, UINT Select3)
+{
+ XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
+ return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control );
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+
+#include "xnamathconvert.inl"
+#include "xnamathvector.inl"
+#include "xnamathmatrix.inl"
+#include "xnamathmisc.inl"
+
+#pragma prefast(pop)
+#pragma warning(pop)
+
+#endif // __XNAMATH_H__
+
diff --git a/thirdparty/directxtex/XNAMath/xnamathconvert.inl b/thirdparty/directxtex/XNAMath/xnamathconvert.inl
new file mode 100644
index 00000000..63ed20df
--- /dev/null
+++ b/thirdparty/directxtex/XNAMath/xnamathconvert.inl
@@ -0,0 +1,6383 @@
+/************************************************************************
+* *
+* xnamathconvert.inl -- SIMD C++ Math library for Windows and Xbox 360 *
+* Conversion, loading, and storing functions *
+* *
+* Copyright (c) Microsoft Corp. All rights reserved. *
+* *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATHCONVERT_INL__
+#define __XNAMATHCONVERT_INL__
+
+#define XM_PACK_FACTOR (FLOAT)(1 << 22)
+#define XM_UNPACK_FACTOR_UNSIGNED (FLOAT)(1 << 23)
+#define XM_UNPACK_FACTOR_SIGNED XM_PACK_FACTOR
+
+#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
+ {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
+ -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
+ -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
+ -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
+
+#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
+ {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
+ XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
+ XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
+ XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
+
+#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
+ {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
+ -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
+ -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
+ -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}
+
+//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
+// {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
+// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
+// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
+// -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}
+
+#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
+ {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}
+
+#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
+ {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
+ -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}
+
+#define XM_PACK_OFFSET XMVectorSplatConstant(3, 0)
+//#define XM_UNPACK_OFFSET XM_PACK_OFFSET
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMConvertHalfToFloat
+(
+ HALF Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+
+ UINT Mantissa;
+ UINT Exponent;
+ UINT Result;
+
+ Mantissa = (UINT)(Value & 0x03FF);
+
+ if ((Value & 0x7C00) != 0) // The value is normalized
+ {
+ Exponent = (UINT)((Value >> 10) & 0x1F);
+ }
+ else if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x0400) == 0);
+
+ Mantissa &= 0x03FF;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result = ((Value & 0x8000) << 16) | // Sign
+ ((Exponent + 112) << 23) | // Exponent
+ (Mantissa << 13); // Mantissa
+
+ return *(FLOAT*)&Result;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT* XMConvertHalfToFloatStream
+(
+ FLOAT* pOutputStream,
+ size_t OutputStride,
+ CONST HALF* pInputStream,
+ size_t InputStride,
+ size_t HalfCount
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+
+ size_t i;
+ CONST BYTE* pHalf = (CONST BYTE*)pInputStream;
+ BYTE* pFloat = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < HalfCount; i++)
+ {
+ *(FLOAT*)pFloat = XMConvertHalfToFloat(*(const HALF*)pHalf);
+ pHalf += InputStride;
+ pFloat += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE HALF XMConvertFloatToHalf
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+ UINT Result;
+
+ UINT IValue = ((UINT *)(&Value))[0];
+ UINT Sign = (IValue & 0x80000000U) >> 16U;
+ IValue = IValue & 0x7FFFFFFFU; // Hack off the sign
+
+ if (IValue > 0x47FFEFFFU)
+ {
+ // The number is too large to be represented as a half. Saturate to infinity.
+ Result = 0x7FFFU;
+ }
+ else
+ {
+ if (IValue < 0x38800000U)
+ {
+ // The number is too small to be represented as a normalized half.
+ // Convert it to a denormalized value.
+ UINT Shift = 113U - (IValue >> 23U);
+ IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
+ }
+ else
+ {
+ // Rebias the exponent to represent the value as a normalized half.
+ IValue += 0xC8000000U;
+ }
+
+ Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
+ }
+ return (HALF)(Result|Sign);
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif
+}
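+
+// Note (illustrative, not part of the original XNAMath source): a worked example of
+// the two scalar conversions above. 1.5f is 0x3FC00000; rebiasing the exponent by
+// adding 0xC8000000 and shifting down 13 bits yields the half pattern 0x3E00, and
+// XMConvertHalfToFloat reverses it (exponent 15 + 112 = 127, mantissa 0x200 << 13):
+//
+//     HALF  h = XMConvertFloatToHalf( 1.5f );    // h == 0x3E00
+//     FLOAT f = XMConvertHalfToFloat( h );       // f == 1.5f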
+
+//------------------------------------------------------------------------------
+
+XMINLINE HALF* XMConvertFloatToHalfStream
+(
+ HALF* pOutputStream,
+ size_t OutputStride,
+ CONST FLOAT* pInputStream,
+ size_t InputStride,
+ size_t FloatCount
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+
+ size_t i;
+ BYTE* pFloat = (BYTE*)pInputStream;
+ BYTE* pHalf = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < FloatCount; i++)
+ {
+ *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
+ pFloat += InputStride;
+ pHalf += OutputStride;
+ }
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+// For VMX128, these routines are all defines in the main header
+
+#pragma warning(push)
+#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
+
+XMINLINE XMVECTOR XMConvertVectorIntToFloat
+(
+ FXMVECTOR VInt,
+ UINT DivExponent
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ElementIndex;
+ FLOAT fScale;
+ XMVECTOR Result;
+ XMASSERT(DivExponent<32);
+ fScale = 1.0f / (FLOAT)(1U << DivExponent);
+ ElementIndex = 0;
+ do {
+ INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
+ Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
+ } while (++ElementIndex<4);
+ return Result;
+#else // _XM_SSE_INTRINSICS_
+ XMASSERT(DivExponent<32);
+ // Convert to floats
+ XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
+ // Convert DivExponent into 1.0f/(1<<DivExponent)
+ UINT uScale = 0x3F800000U - (DivExponent << 23);
+ // Splat the scalar value
+ __m128i vScale = _mm_set1_epi32(uScale);
+ vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
+ return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMConvertVectorFloatToInt
+(
+ FXMVECTOR VFloat,
+ UINT MulExponent
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ElementIndex;
+ XMVECTOR Result;
+ FLOAT fScale;
+ XMASSERT(MulExponent<32);
+ // Get the scalar factor.
+ fScale = (FLOAT)(1U << MulExponent);
+ ElementIndex = 0;
+ do {
+ INT iResult;
+ FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+ if (fTemp <= -(65536.0f*32768.0f)) {
+ iResult = (-0x7FFFFFFF)-1;
+ } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
+ iResult = 0x7FFFFFFF;
+ } else {
+ iResult = (INT)fTemp;
+ }
+ Result.vector4_u32[ElementIndex] = (UINT)iResult;
+ } while (++ElementIndex<4);
+ return Result;
+#else // _XM_SSE_INTRINSICS_
+ XMASSERT(MulExponent<32);
+ XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
+ vResult = _mm_mul_ps(vResult,VFloat);
+ // In case of positive overflow, detect it
+ XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
+ // Float to int conversion
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // If there was positive overflow, set to 0x7FFFFFFF
+ vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+ vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
+ vOverflow = _mm_or_ps(vOverflow,vResult);
+ return vOverflow;
+#endif
+}
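+
+// Note (illustrative, not part of the original XNAMath source): _mm_cvttps_epi32
+// returns the "integer indefinite" value 0x80000000 for inputs above INT_MAX, so the
+// SSE path above detects positive overflow before the conversion and forces those
+// lanes to 0x7FFFFFFF instead, matching the saturation in the scalar path. E.g. a
+// lane holding 4.0e9f (after the MulExponent scale) comes back as 0x7FFFFFFF rather
+// than 0x80000000.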
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMConvertVectorUIntToFloat
+(
+ FXMVECTOR VUInt,
+ UINT DivExponent
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ElementIndex;
+ FLOAT fScale;
+ XMVECTOR Result;
+ XMASSERT(DivExponent<32);
+ fScale = 1.0f / (FLOAT)(1U << DivExponent);
+ ElementIndex = 0;
+ do {
+ Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
+ } while (++ElementIndex<4);
+ return Result;
+#else // _XM_SSE_INTRINSICS_
+ XMASSERT(DivExponent<32);
+ // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+ // Determine which ones need the fix.
+ XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
+ // Force all values positive
+ XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
+ // Convert to floats
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert 0x80000000 -> 0xFFFFFFFF
+ __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
+ // For only the ones that are too big, add the fixup
+ vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
+ vResult = _mm_add_ps(vResult,vMask);
+ // Convert DivExponent into 1.0f/(1<<DivExponent)
+ UINT uScale = 0x3F800000U - (DivExponent << 23);
+ // Splat
+ iMask = _mm_set1_epi32(uScale);
+ vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
+ return vResult;
+#endif
+}
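+
+// Note (illustrative, not part of the original XNAMath source): _mm_cvtepi32_ps only
+// understands signed integers, so inputs with bit 31 set would convert as negative
+// values. The routine above clears bit 31 first, converts, and then adds 2^31
+// (g_XMFixUnsigned) back to just those lanes. The same idea in scalar form:
+//
+//     UINT  u = 0xC0000000U;                        // negative if treated as an INT
+//     FLOAT f = (FLOAT)(INT)(u & 0x7FFFFFFFU)       // 1073741824.0f
+//             + 2147483648.0f;                      // fixup -> 3221225472.0f == (FLOAT)u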
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMConvertVectorFloatToUInt
+(
+ FXMVECTOR VFloat,
+ UINT MulExponent
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ElementIndex;
+ XMVECTOR Result;
+ FLOAT fScale;
+ XMASSERT(MulExponent<32);
+ // Get the scalar factor.
+ fScale = (FLOAT)(1U << MulExponent);
+ ElementIndex = 0;
+ do {
+ UINT uResult;
+ FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+ if (fTemp <= 0.0f) {
+ uResult = 0;
+ } else if (fTemp >= (65536.0f*65536.0f)) {
+ uResult = 0xFFFFFFFFU;
+ } else {
+ uResult = (UINT)fTemp;
+ }
+ Result.vector4_u32[ElementIndex] = uResult;
+ } while (++ElementIndex<4);
+ return Result;
+#else // _XM_SSE_INTRINSICS_
+ XMASSERT(MulExponent<32);
+ XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+ vResult = _mm_mul_ps(vResult,VFloat);
+ // Clamp to >=0
+ vResult = _mm_max_ps(vResult,g_XMZero);
+ // Any numbers that are too big, set to 0xFFFFFFFFU
+ XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+ XMVECTOR vValue = g_XMUnsignedFix;
+ // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+ // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+ vValue = _mm_and_ps(vValue,vMask);
+ // Perform fixup only on numbers too large (Keeps low bit precision)
+ vResult = _mm_sub_ps(vResult,vValue);
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Convert from signed to unsigned only if greater than 0x80000000
+ vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+ vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
+ // On those that are too large, set to 0xFFFFFFFF
+ vResult = _mm_or_ps(vResult,vOverflow);
+ return vResult;
+#endif
+}
+
+#pragma warning(pop)
+
+#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 3) == 0);
+
+ V.vector4_u32[0] = *pSource;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 3) == 0);
+
+ return _mm_load_ss( (const float*)pSource );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 3) == 0);
+
+ V.vector4_f32[0] = *pSource;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 3) == 0);
+
+ return _mm_load_ss( pSource );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt2
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+
+ return V;
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pSource);
+
+ __m128 x = _mm_load_ss( (const float*)pSource );
+ __m128 y = _mm_load_ss( (const float*)(pSource+1) );
+ return _mm_unpacklo_ps( x, y );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadSInt2
+(
+ CONST XMINT2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+ __m128 x = _mm_load_ss( (const float*)&pSource->x );
+ __m128 y = _mm_load_ss( (const float*)&pSource->y );
+ __m128 V = _mm_unpacklo_ps( x, y );
+ return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUInt2
+(
+ CONST XMUINT2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+ __m128 x = _mm_load_ss( (const float*)&pSource->x );
+ __m128 y = _mm_load_ss( (const float*)&pSource->y );
+ __m128 V = _mm_unpacklo_ps( x, y );
+ // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+ // Determine which ones need the fix.
+ XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+ // Force all values positive
+ XMVECTOR vResult = _mm_xor_ps(V,vMask);
+ // Convert to floats
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert 0x80000000 -> 0xFFFFFFFF
+ __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
+ // For only the ones that are too big, add the fixup
+ vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
+ vResult = _mm_add_ps(vResult,vMask);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt2A
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat2
+(
+ CONST XMFLOAT2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
+ ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
+ return V;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+ __m128 x = _mm_load_ss( &pSource->x );
+ __m128 y = _mm_load_ss( &pSource->y );
+ return _mm_unpacklo_ps( x, y );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat2A
+(
+ CONST XMFLOAT2A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_f32[0] = pSource->x;
+ V.vector4_f32[1] = pSource->y;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadHalf2
+(
+ CONST XMHALF2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ XMConvertHalfToFloat(pSource->x),
+ XMConvertHalfToFloat(pSource->y),
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMVECTOR vResult = {
+ XMConvertHalfToFloat(pSource->x),
+ XMConvertHalfToFloat(pSource->y),
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadShortN2
+(
+ CONST XMSHORTN2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
+ (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the two shorts in all four entries (WORD alignment okay,
+ // DWORD alignment preferred)
+ __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
+ // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+ vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
+ // x needs to be sign extended
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x - 0x8000 to undo the signed order.
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
+ // Convert to -1.0f - 1.0f
+ vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
+ // Clamp result (for case of -32768)
+ return _mm_max_ps( vTemp, g_XMNegativeOne );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
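+
+// Note (illustrative, not part of the original XNAMath source): the divide by 32767
+// plus the final clamp above follow the usual SNORM convention, so both -32768 and
+// -32767 decode to exactly -1.0f:
+//
+//     XMSHORTN2 s;  s.x = -32768;  s.y = 16384;
+//     XMVECTOR  v = XMLoadShortN2( &s );   // x lane == -1.0f, y lane == 16384.0f/32767.0f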
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadShort2
+(
+ CONST XMSHORT2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the two shorts in all four entries (WORD alignment okay,
+ // DWORD alignment preferred)
+ __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
+ // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+ vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
+ // x needs to be sign extended
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x - 0x8000 to undo the signed order.
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
+ // Y is 65536 times too large
+ return _mm_mul_ps(vTemp,g_XMFixupY16);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUShortN2
+(
+ CONST XMUSHORTN2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
+ V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
+ static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
+ XMASSERT(pSource);
+ // Splat the two shorts in all four entries (WORD alignment okay,
+ // DWORD alignment preferred)
+ __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
+ // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+ vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
+ // y needs to be sign flipped
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // y + 0x8000 to undo the signed order.
+ vTemp = _mm_add_ps(vTemp,FixaddY16);
+ // Y is 65536 times too large
+ vTemp = _mm_mul_ps(vTemp,FixupY16);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUShort2
+(
+ CONST XMUSHORT2* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
+ XMASSERT(pSource);
+ // Splat the two shorts in all four entries (WORD alignment okay,
+ // DWORD alignment preferred)
+ __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
+ // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+ vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
+ // y needs to be sign flipped
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // Y is 65536 times too large
+ vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
+ // y + 0x8000 to undo the signed order.
+ vTemp = _mm_add_ps(vTemp,FixaddY16);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadByteN2
+(
+ CONST XMBYTEN2* pSource
+)
+{
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x * (1.0f/127.0f)),
+ (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y * (1.0f/127.0f)),
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadByte2
+(
+ CONST XMBYTE2* pSource
+)
+{
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (FLOAT)pSource->x,
+ (FLOAT)pSource->y,
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUByteN2
+(
+ CONST XMUBYTEN2* pSource
+)
+{
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (FLOAT)pSource->x * (1.0f/255.0f),
+ (FLOAT)pSource->y * (1.0f/255.0f),
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUByte2
+(
+ CONST XMUBYTE2* pSource
+)
+{
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (FLOAT)pSource->x,
+ (FLOAT)pSource->y,
+ 0.0f,
+ 0.0f
+ };
+ return vResult;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt3
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+ V.vector4_u32[2] = pSource[2];
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
+ __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else
+ __m128 x = _mm_load_ss( (const float*)pSource );
+ __m128 y = _mm_load_ss( (const float*)(pSource+1) );
+ __m128 z = _mm_load_ss( (const float*)(pSource+2) );
+ __m128 xy = _mm_unpacklo_ps( x, y );
+ return _mm_movelh_ps( xy, z );
+#endif // !_XM_ISVS2005_
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadSInt3
+(
+ CONST XMINT3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+#ifdef _XBOX_VER
+ V = XMLoadInt3( (const UINT*)pSource );
+ return XMConvertVectorIntToFloat( V, 0 );
+#else
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ V.vector4_f32[2] = (float)pSource->z;
+ return V;
+#endif
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
+ __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
+ return _mm_cvtepi32_ps(V);
+#else
+ __m128 x = _mm_load_ss( (const float*)&pSource->x );
+ __m128 y = _mm_load_ss( (const float*)&pSource->y );
+ __m128 z = _mm_load_ss( (const float*)&pSource->z );
+ __m128 xy = _mm_unpacklo_ps( x, y );
+ __m128 V = _mm_movelh_ps( xy, z );
+ return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
+#endif // !_XM_ISVS2005_
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUInt3
+(
+ CONST XMUINT3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ V.vector4_f32[2] = (float)pSource->z;
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
+ __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
+ // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+ // Determine which ones need the fix.
+ XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
+ // Force all values positive
+ XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
+#else
+ __m128 x = _mm_load_ss( (const float*)&pSource->x );
+ __m128 y = _mm_load_ss( (const float*)&pSource->y );
+ __m128 z = _mm_load_ss( (const float*)&pSource->z );
+ __m128 xy = _mm_unpacklo_ps( x, y );
+ __m128 V = _mm_movelh_ps( xy, z );
+ // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+ // Determine which ones need the fix.
+ XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+ // Force all values positive
+ XMVECTOR vResult = _mm_xor_ps(V,vMask);
+#endif // !_XM_ISVS2005_
+ // Convert to floats
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert 0x80000000 -> 0xFFFFFFFF
+ __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
+ // For only the ones that are too big, add the fixup
+ vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
+ vResult = _mm_add_ps(vResult,vMask);
+ return vResult;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt3A
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+ V.vector4_u32[2] = pSource[2];
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+ // Reads an extra integer that is 'undefined'
+
+ __m128i V = _mm_load_si128( (const __m128i*)pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3
+(
+ CONST XMFLOAT3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
+ ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
+ ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
+ return V;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+#ifdef _XM_ISVS2005_
+ // This reads 1 float past the end of the structure; the extra value should be ignored.
+ // Need to continue to do this for VS 2005 due to compiler issue but prefer new method
+ // to avoid triggering issues with memory debug tools (like AV)
+ return _mm_loadu_ps( &pSource->x );
+#else
+ __m128 x = _mm_load_ss( &pSource->x );
+ __m128 y = _mm_load_ss( &pSource->y );
+ __m128 z = _mm_load_ss( &pSource->z );
+ __m128 xy = _mm_unpacklo_ps( x, y );
+ return _mm_movelh_ps( xy, z );
+#endif // !_XM_ISVS2005_
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3A
+(
+ CONST XMFLOAT3A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_f32[0] = pSource->x;
+ V.vector4_f32[1] = pSource->y;
+ V.vector4_f32[2] = pSource->z;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ // This reads 1 float past the end of the structure; the extra value should be ignored.
+ return _mm_load_ps( &pSource->x );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUHenDN3
+(
+ CONST XMUHENDN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x7FF;
+ V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
+ Element = (pSource->v >> 11) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
+ Element = (pSource->v >> 22) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMFlipZ);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
+ // Normalize x,y and z to 0.0f-1.0f
+ vResult = _mm_mul_ps(vResult,UHenDN3Mul);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
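+
+// Note (illustrative, not part of the original XNAMath source): XMUHENDN3 packs three
+// unsigned fields into 32 bits as x:11 | y:11 | z:10, with x in the low bits, each
+// then normalized to 0..1 as in the scalar path above. For example v == 0x000007FF
+// decodes to (1.0f, 0.0f, 0.0f) and v == 0xFFC00000 decodes to (0.0f, 0.0f, 1.0f).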
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUHenD3
+(
+ CONST XMUHEND3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x7FF;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 11) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 22) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)Element;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMFlipZ);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
+ // Normalize x and y to 0-2047.0f and z to 0-1023.0f
+ vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadHenDN3
+(
+ CONST XMHENDN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
+ static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
+
+ Element = pSource->v & 0x7FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
+ Element = (pSource->v >> 11) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
+ Element = (pSource->v >> 22) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddHenD3);
+ // Normalize x,y and z to -1.0f-1.0f
+ vResult = _mm_mul_ps(vResult,HenDN3Mul);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadHenD3
+(
+ CONST XMHEND3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
+ static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
+
+ Element = pSource->v & 0x7FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
+ Element = (pSource->v >> 11) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
+ Element = (pSource->v >> 22) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddHenD3);
+ // Normalize x and y to -1024-1023.0f and z to -512-511.0f
+ vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUDHenN3
+(
+ CONST XMUDHENN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
+ Element = (pSource->v >> 10) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
+ Element = (pSource->v >> 21) & 0x7FF;
+ V.vector4_f32[2] = (FLOAT)Element / 2047.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMFlipZ);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
+ // Normalize x,y and z to 0.0f-1.0f
+ vResult = _mm_mul_ps(vResult,UDHenN3Mul);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUDHen3
+(
+ CONST XMUDHEN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 10) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 21) & 0x7FF;
+ V.vector4_f32[2] = (FLOAT)Element;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMFlipZ);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
+ // Normalize x to 0-1023.0f and y and z to 0-2047.0f
+ vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadDHenN3
+(
+ CONST XMDHENN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
+ static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
+ Element = (pSource->v >> 10) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
+ Element = (pSource->v >> 21) & 0x7FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddDHen3);
+ // Normalize x,y and z to -1.0f-1.0f
+ vResult = _mm_mul_ps(vResult,DHenN3Mul);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadDHen3
+(
+ CONST XMDHEN3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
+ static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
+ Element = (pSource->v >> 10) & 0x7FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
+ Element = (pSource->v >> 21) & 0x7FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
+ XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
+ // Convert x and y to unsigned
+ vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert x and y back to signed
+ vResult = _mm_add_ps(vResult,g_XMAddDHen3);
+ // Normalize x to -512-511.0f and y and z to -1024-1023.0f
+ vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadU565
+(
+ CONST XMU565* pSource
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
+ static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,U565And);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Normalize x, y, and z
+ vResult = _mm_mul_ps(vResult,U565Mul);
+ return vResult;
+#else
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x1F;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 5) & 0x3F;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 11) & 0x1F;
+ V.vector4_f32[2] = (FLOAT)Element;
+
+ return V;
+#endif // !_XM_SSE_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3PK
+(
+ CONST XMFLOAT3PK* pSource
+)
+{
+ _DECLSPEC_ALIGN_16_ UINT Result[4];
+ UINT Mantissa;
+ UINT Exponent;
+
+ XMASSERT(pSource);
+
+ // X Channel (6-bit mantissa)
+ Mantissa = pSource->xm;
+
+ if ( pSource->xe == 0x1f ) // INF or NAN
+ {
+ Result[0] = 0x7f800000 | (pSource->xm << 17);
+ }
+ else
+ {
+ if ( pSource->xe != 0 ) // The value is normalized
+ {
+ Exponent = pSource->xe;
+ }
+ else if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x40) == 0);
+
+ Mantissa &= 0x3F;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
+ }
+
+ // Y Channel (6-bit mantissa)
+ Mantissa = pSource->ym;
+
+ if ( pSource->ye == 0x1f ) // INF or NAN
+ {
+ Result[1] = 0x7f800000 | (pSource->ym << 17);
+ }
+ else
+ {
+ if ( pSource->ye != 0 ) // The value is normalized
+ {
+ Exponent = pSource->ye;
+ }
+ else if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x40) == 0);
+
+ Mantissa &= 0x3F;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
+ }
+
+ // Z Channel (5-bit mantissa)
+ Mantissa = pSource->zm;
+
+ if ( pSource->ze == 0x1f ) // INF or NAN
+ {
+ Result[2] = 0x7f800000 | (pSource->zm << 17);
+ }
+ else
+ {
+ if ( pSource->ze != 0 ) // The value is normalized
+ {
+ Exponent = pSource->ze;
+ }
+ else if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x20) == 0);
+
+ Mantissa &= 0x1F;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
+ }
+
+ return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
+}
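+
+// Note (illustrative, not part of the original XNAMath source): in the packed 11:11:10
+// float format above, x and y carry a 5-bit exponent with a 6-bit mantissa and z a
+// 5-bit exponent with a 5-bit mantissa, all unsigned. Following the x-channel path,
+// xe == 15 with xm == 0 produces ((15 + 112) << 23) == 0x3F800000, i.e. exactly 1.0f,
+// and xe == 15 with xm == 32 produces 0x3FC00000, i.e. 1.5f.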
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat3SE
+(
+ CONST XMFLOAT3SE* pSource
+)
+{
+ _DECLSPEC_ALIGN_16_ UINT Result[4];
+ UINT Mantissa;
+ UINT Exponent, ExpBits;
+
+ XMASSERT(pSource);
+
+ if ( pSource->e == 0x1f ) // INF or NAN
+ {
+ Result[0] = 0x7f800000 | (pSource->xm << 14);
+ Result[1] = 0x7f800000 | (pSource->ym << 14);
+ Result[2] = 0x7f800000 | (pSource->zm << 14);
+ }
+ else if ( pSource->e != 0 ) // The values are all normalized
+ {
+ Exponent = pSource->e;
+
+ ExpBits = (Exponent + 112) << 23;
+
+ Mantissa = pSource->xm;
+ Result[0] = ExpBits | (Mantissa << 14);
+
+ Mantissa = pSource->ym;
+ Result[1] = ExpBits | (Mantissa << 14);
+
+ Mantissa = pSource->zm;
+ Result[2] = ExpBits | (Mantissa << 14);
+ }
+ else
+ {
+ // X Channel
+ Mantissa = pSource->xm;
+
+ if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x200) == 0);
+
+ Mantissa &= 0x1FF;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);
+
+ // Y Channel
+ Mantissa = pSource->ym;
+
+ if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x200) == 0);
+
+ Mantissa &= 0x1FF;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);
+
+ // Z Channel
+ Mantissa = pSource->zm;
+
+ if (Mantissa != 0) // The value is denormalized
+ {
+ // Normalize the value in the resulting float
+ Exponent = 1;
+
+ do
+ {
+ Exponent--;
+ Mantissa <<= 1;
+ } while ((Mantissa & 0x200) == 0);
+
+ Mantissa &= 0x1FF;
+ }
+ else // The value is zero
+ {
+ Exponent = (UINT)-112;
+ }
+
+ Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
+ }
+
+ return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
+}
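+
+// Note (illustrative, not part of the original XNAMath source): XMFLOAT3SE stores one
+// 5-bit exponent shared by three 9-bit mantissas. Following the normalized path above,
+// e == 16 with xm == 256 produces ((16 + 112) << 23) | (256 << 14) == 0x40400000,
+// i.e. exactly 3.0f for the x channel.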
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt4
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+ V.vector4_u32[2] = pSource[2];
+ V.vector4_u32[3] = pSource[3];
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pSource);
+
+ __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadSInt4
+(
+ CONST XMINT4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+#ifdef _XBOX_VER
+ V = XMLoadInt4( (const UINT*)pSource );
+ return XMConvertVectorIntToFloat( V, 0 );
+#else
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ V.vector4_f32[2] = (float)pSource->z;
+ V.vector4_f32[3] = (float)pSource->w;
+ return V;
+#endif
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
+ return _mm_cvtepi32_ps(V);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUInt4
+(
+ CONST XMUINT4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (float)pSource->x;
+ V.vector4_f32[1] = (float)pSource->y;
+ V.vector4_f32[2] = (float)pSource->z;
+ V.vector4_f32[3] = (float)pSource->w;
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
+ // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+ // Determine which ones need the fix.
+ XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
+ // Force all values positive
+ XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
+ // Convert to floats
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Convert 0x80000000 -> 0xFFFFFFFF
+ __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
+ // For only the ones that are too big, add the fixup
+ vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
+ vResult = _mm_add_ps(vResult,vMask);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadInt4A
+(
+ CONST UINT* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_u32[0] = pSource[0];
+ V.vector4_u32[1] = pSource[1];
+ V.vector4_u32[2] = pSource[2];
+ V.vector4_u32[3] = pSource[3];
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ __m128i V = _mm_load_si128( (const __m128i*)pSource );
+ return reinterpret_cast<__m128 *>(&V)[0];
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat4
+(
+ CONST XMFLOAT4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ XMASSERT(pSource);
+
+ ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
+ ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
+ ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
+ ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];
+ return V;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+
+ return _mm_loadu_ps( &pSource->x );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadFloat4A
+(
+ CONST XMFLOAT4A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ V.vector4_f32[0] = pSource->x;
+ V.vector4_f32[1] = pSource->y;
+ V.vector4_f32[2] = pSource->z;
+ V.vector4_f32[3] = pSource->w;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ return _mm_load_ps( &pSource->x );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadHalf4
+(
+ CONST XMHALF4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ XMConvertHalfToFloat(pSource->x),
+ XMConvertHalfToFloat(pSource->y),
+ XMConvertHalfToFloat(pSource->z),
+ XMConvertHalfToFloat(pSource->w)
+ };
+ return vResult;
+ }
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMVECTOR vResult = {
+ XMConvertHalfToFloat(pSource->x),
+ XMConvertHalfToFloat(pSource->y),
+ XMConvertHalfToFloat(pSource->z),
+ XMConvertHalfToFloat(pSource->w)
+ };
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadShortN4
+(
+ CONST XMSHORTN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pSource);
+ {
+ XMVECTOR vResult = {
+ (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
+ (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
+ (pSource->z == -32768) ? -1.f : ((FLOAT)pSource->z * (1.0f/32767.0f)),
+ (pSource->w == -32768) ? -1.f : ((FLOAT)pSource->w * (1.0f/32767.0f))
+ };
+ return vResult;
+ }
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the color in all four entries (x,z,y,w)
+ __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
+ // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+ __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x8000 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+ // Convert to -1.0f - 1.0f
+ vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
+ // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
+ // Clamp result (for case of -32768)
+ return _mm_max_ps( vTemp, g_XMNegativeOne );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadShort4
+(
+ CONST XMSHORT4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+ V.vector4_f32[2] = (FLOAT)pSource->z;
+ V.vector4_f32[3] = (FLOAT)pSource->w;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the color in all four entries (x,z,y,w)
+ __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
+    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
+ __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x8000 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+    // Fix y and w because they are 65536 times too large
+ vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+ // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUShortN4
+(
+ CONST XMUSHORTN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
+ V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
+ V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
+ V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
+ static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
+ XMASSERT(pSource);
+ // Splat the color in all four entries (x,z,y,w)
+ __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
+    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
+ __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
+ // y and w are signed! Flip the bits to convert the order to unsigned
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // y and w + 0x8000 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+    // Fix y and w because they are 65536 times too large
+ vTemp = _mm_mul_ps(vTemp,FixupY16W16);
+ // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUShort4
+(
+ CONST XMUSHORT4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+ V.vector4_f32[2] = (FLOAT)pSource->z;
+ V.vector4_f32[3] = (FLOAT)pSource->w;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
+ XMASSERT(pSource);
+ // Splat the color in all four entries (x,z,y,w)
+ __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
+    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
+ __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
+ // y and w are signed! Flip the bits to convert the order to unsigned
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Fix y and w because they are 65536 times too large
+ vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+ // y and w + 0x8000 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+ // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadXIcoN4
+(
+ CONST XMXICON4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
+
+ Element = (UINT)(pSource->v & 0xFFFFF);
+ V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
+ V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
+ V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
+ static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadXIco4
+(
+ CONST XMXICO4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
+
+ Element = (UINT)(pSource->v & 0xFFFFF);
+ V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
+ V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
+ V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUIcoN4
+(
+ CONST XMUICON4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
+ V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
+ V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUIco4
+(
+ CONST XMUICO4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
+ V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
+ V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadIcoN4
+(
+ CONST XMICON4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
+ static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
+
+ XMASSERT(pSource);
+
+ Element = (UINT)(pSource->v & 0xFFFFF);
+ V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
+ V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
+ V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
+ Element = (UINT)(pSource->v >> 60);
+ V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadIco4
+(
+ CONST XMICO4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
+ static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
+
+ XMASSERT(pSource);
+
+ Element = (UINT)(pSource->v & 0xFFFFF);
+ V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
+ V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
+ V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
+ Element = (UINT)(pSource->v >> 60);
+ V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Grab the 64 bit structure
+ __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
+    // By shifting down 8 bits, y and z are in separate 32 bit elements
+ __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
+ // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
+ XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
+ // Fix the entries to x,y,z,w
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
+ // Mask x,y,z and w
+ vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
+ // x and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
+ // Fix y and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadXDecN4
+(
+ CONST XMXDECN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the color in all four entries
+ __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
+    // Normalize x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
+ return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadXDec4
+(
+ CONST XMXDEC4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+ static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
+ static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
+ XMASSERT(pSource);
+ // Splat the color in all four entries
+ XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,XDec4Xor);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,XDec4Add);
+    // Rescale y, z and w back to their integer ranges
+ vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUDecN4
+(
+ CONST XMUDECN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+ // Splat the color in all four entries
+ XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Normalize all four components to 0.0f..1.0f
+ vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUDec4
+(
+ CONST XMUDEC4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)Element;
+ V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the color in all four entries
+ XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Rescale y, z and w back to their integer ranges
+ vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadDecN4
+(
+ CONST XMDECN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
+ static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
+ Element = pSource->v >> 30;
+ V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+ static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+ // Splat the color in all four entries
+ XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Normalize x, y and z to -1.0f..1.0f (w keeps its integer value)
+ vTemp = _mm_mul_ps(vTemp,DecN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadDec4
+(
+ CONST XMDEC4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ UINT Element;
+ static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
+ static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+ XMASSERT(pSource);
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+
+ Element = pSource->v & 0x3FF;
+ V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ Element = (pSource->v >> 10) & 0x3FF;
+ V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ Element = (pSource->v >> 20) & 0x3FF;
+ V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
+ Element = pSource->v >> 30;
+ V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT((pSource->v & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
+ XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
+ XMASSERT(pSource);
+ // Splat the color in all four entries
+ XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+    // Mask x (bits 0-9), y (bits 10-19), z (bits 20-29) and w (bits 30-31)
+ vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the needed sign bits so the signed integer conversion gives the right values
+ vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+    // Add the bias that undoes the sign-bit flip above
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Rescale y, z and w back to their integer ranges
+ vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUByteN4
+(
+ CONST XMUBYTEN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
+ V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
+ V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
+ V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
+ XMASSERT(pSource);
+    // Splat the 32-bit value into all four entries
+ XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
+    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
+ vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+ // w is signed! Flip the bits to convert the order to unsigned
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // w + 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+ // Fix y, z and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUByte4
+(
+ CONST XMUBYTE4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+ V.vector4_f32[2] = (FLOAT)pSource->z;
+ V.vector4_f32[3] = (FLOAT)pSource->w;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+ XMASSERT(pSource);
+    // Splat the 32-bit value into all four entries
+ XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
+    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
+ vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+ // w is signed! Flip the bits to convert the order to unsigned
+ vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // w + 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+ // Fix y, z and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadByteN4
+(
+ CONST XMBYTEN4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x / 127.0f);
+ V.vector4_f32[1] = (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y / 127.0f);
+ V.vector4_f32[2] = (pSource->z == -128) ? -1.f : ((FLOAT)pSource->z / 127.0f);
+ V.vector4_f32[3] = (pSource->w == -128) ? -1.f : ((FLOAT)pSource->w / 127.0f);
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
+ XMASSERT(pSource);
+    // Splat the 32-bit value into all four entries
+ XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
+    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
+ vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+ // x,y and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x, y and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+ // Fix y, z and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
+ // Clamp result (for case of -128)
+ return _mm_max_ps( vTemp, g_XMNegativeOne );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadByte4
+(
+ CONST XMBYTE4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+
+ XMASSERT(pSource);
+
+ V.vector4_f32[0] = (FLOAT)pSource->x;
+ V.vector4_f32[1] = (FLOAT)pSource->y;
+ V.vector4_f32[2] = (FLOAT)pSource->z;
+ V.vector4_f32[3] = (FLOAT)pSource->w;
+
+ return V;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+ XMASSERT(pSource);
+    // Splat the 32-bit value into all four entries
+ XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
+    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
+ vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+ // x,y and z are unsigned! Flip the bits to convert the order to signed
+ vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+ // Convert to floating point numbers
+ vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
+ // x, y and z - 0x80 to complete the conversion
+ vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+ // Fix y, z and w because they are too large
+ vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
+ return vTemp;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadUNibble4
+(
+ CONST XMUNIBBLE4* pSource
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
+ static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,UNibble4And);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Normalize x, y, and z
+ vResult = _mm_mul_ps(vResult,UNibble4Mul);
+ return vResult;
+#else
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0xF;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 4) & 0xF;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 8) & 0xF;
+ V.vector4_f32[2] = (FLOAT)Element;
+ Element = (pSource->v >> 12) & 0xF;
+ V.vector4_f32[3] = (FLOAT)Element;
+
+ return V;
+#endif // !_XM_SSE_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadU555
+(
+ CONST XMU555* pSource
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
+ static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
+ XMASSERT(pSource);
+ // Get the 32 bit value and splat it
+ XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
+ // Mask off x, y and z
+ vResult = _mm_and_ps(vResult,U555And);
+ // Convert to float
+ vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
+ // Normalize x, y, and z
+ vResult = _mm_mul_ps(vResult,U555Mul);
+ return vResult;
+#else
+ XMVECTOR V;
+ UINT Element;
+
+ XMASSERT(pSource);
+
+ Element = pSource->v & 0x1F;
+ V.vector4_f32[0] = (FLOAT)Element;
+ Element = (pSource->v >> 5) & 0x1F;
+ V.vector4_f32[1] = (FLOAT)Element;
+ Element = (pSource->v >> 10) & 0x1F;
+ V.vector4_f32[2] = (FLOAT)Element;
+ Element = (pSource->v >> 15) & 0x1;
+ V.vector4_f32[3] = (FLOAT)Element;
+
+ return V;
+#endif // !_XM_SSE_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMLoadColor
+(
+ CONST XMCOLOR* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pSource);
+ {
+ // INT -> Float conversions are done in one instruction.
+ // UINT -> Float calls a runtime function. Keep in INT
+ INT iColor = (INT)(pSource->c);
+ XMVECTOR vColor = {
+ (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
+ (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
+ (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
+ (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
+ };
+ return vColor;
+ }
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Splat the color in all four entries
+ __m128i vInt = _mm_set1_epi32(pSource->c);
+    // Mask R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
+ vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
+ // a is unsigned! Flip the bit to convert the order to signed
+ vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
+ // Convert to floating point numbers
+ XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
+ // RGB + 0, A + 0x80000000.f to undo the signed order.
+ vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
+ // Convert 0-255 to 0.0f-1.0f
+ return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMLoadFloat3x3
+(
+ CONST XMFLOAT3X3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ XMASSERT(pSource);
+
+ M.r[0].vector4_f32[0] = pSource->m[0][0];
+ M.r[0].vector4_f32[1] = pSource->m[0][1];
+ M.r[0].vector4_f32[2] = pSource->m[0][2];
+ M.r[0].vector4_f32[3] = 0.0f;
+
+ M.r[1].vector4_f32[0] = pSource->m[1][0];
+ M.r[1].vector4_f32[1] = pSource->m[1][1];
+ M.r[1].vector4_f32[2] = pSource->m[1][2];
+ M.r[1].vector4_f32[3] = 0.0f;
+
+ M.r[2].vector4_f32[0] = pSource->m[2][0];
+ M.r[2].vector4_f32[1] = pSource->m[2][1];
+ M.r[2].vector4_f32[2] = pSource->m[2][2];
+ M.r[2].vector4_f32[3] = 0.0f;
+
+ M.r[3].vector4_f32[0] = 0.0f;
+ M.r[3].vector4_f32[1] = 0.0f;
+ M.r[3].vector4_f32[2] = 0.0f;
+ M.r[3].vector4_f32[3] = 1.0f;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;
+
+ Z = _mm_setzero_ps();
+
+ XMASSERT(pSource);
+
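+    // Load the nine floats, shuffle them into three rows with 0 in w, and set row 3 to the identity row.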
+ V1 = _mm_loadu_ps( &pSource->m[0][0] );
+ V2 = _mm_loadu_ps( &pSource->m[1][1] );
+ V3 = _mm_load_ss( &pSource->m[2][2] );
+
+ T1 = _mm_unpackhi_ps( V1, Z );
+ T2 = _mm_unpacklo_ps( V2, Z );
+ T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
+ T4 = _mm_movehl_ps( T2, T3 );
+ T5 = _mm_movehl_ps( Z, T1 );
+
+ M.r[0] = _mm_movelh_ps( V1, T1 );
+ M.r[1] = _mm_add_ps( T4, T5 );
+ M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
+ M.r[3] = g_XMIdentityR3;
+
+ return M;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMLoadFloat4x3
+(
+ CONST XMFLOAT4X3* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+ XMASSERT(pSource);
+
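+    // Copy the 4x3 elements bit-for-bit and fill the last column with 0,0,0,1.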
+ ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
+ ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
+ ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
+ M.r[0].vector4_f32[3] = 0.0f;
+
+ ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
+ ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
+ ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
+ M.r[1].vector4_f32[3] = 0.0f;
+
+ ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
+ ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
+ ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
+ M.r[2].vector4_f32[3] = 0.0f;
+
+ ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
+ ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
+ ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
+ M.r[3].vector4_f32[3] = 1.0f;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Use unaligned load instructions to
+ // load the 12 floats
+ // vTemp1 = x1,y1,z1,x2
+ XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
+ // vTemp2 = y2,z2,x3,y3
+ XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
+ // vTemp4 = z3,x4,y4,z4
+ XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
+ // vTemp3 = x3,y3,z3,z3
+ XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
+ // vTemp2 = y2,z2,x2,x2
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
+ // vTemp2 = x2,y2,z2,z2
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
+ // vTemp1 = x1,y1,z1,0
+ vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
+ // vTemp2 = x2,y2,z2,0
+ vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
+ // vTemp3 = x3,y3,z3,0
+ vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
+ // vTemp4i = x4,y4,z4,0
+ __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
+ // vTemp4i = x4,y4,z4,1.0f
+ vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
+ XMMATRIX M(vTemp1,
+ vTemp2,
+ vTemp3,
+ reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
+ return M;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMLoadFloat4x3A
+(
+ CONST XMFLOAT4X3A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ M.r[0].vector4_f32[0] = pSource->m[0][0];
+ M.r[0].vector4_f32[1] = pSource->m[0][1];
+ M.r[0].vector4_f32[2] = pSource->m[0][2];
+ M.r[0].vector4_f32[3] = 0.0f;
+
+ M.r[1].vector4_f32[0] = pSource->m[1][0];
+ M.r[1].vector4_f32[1] = pSource->m[1][1];
+ M.r[1].vector4_f32[2] = pSource->m[1][2];
+ M.r[1].vector4_f32[3] = 0.0f;
+
+ M.r[2].vector4_f32[0] = pSource->m[2][0];
+ M.r[2].vector4_f32[1] = pSource->m[2][1];
+ M.r[2].vector4_f32[2] = pSource->m[2][2];
+ M.r[2].vector4_f32[3] = 0.0f;
+
+ M.r[3].vector4_f32[0] = pSource->m[3][0];
+ M.r[3].vector4_f32[1] = pSource->m[3][1];
+ M.r[3].vector4_f32[2] = pSource->m[3][2];
+ M.r[3].vector4_f32[3] = 1.0f;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ // Use aligned load instructions to
+ // load the 12 floats
+ // vTemp1 = x1,y1,z1,x2
+ XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
+ // vTemp2 = y2,z2,x3,y3
+ XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
+ // vTemp4 = z3,x4,y4,z4
+ XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
+ // vTemp3 = x3,y3,z3,z3
+ XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
+ // vTemp2 = y2,z2,x2,x2
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
+ // vTemp2 = x2,y2,z2,z2
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
+ // vTemp1 = x1,y1,z1,0
+ vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
+ // vTemp2 = x2,y2,z2,0
+ vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
+ // vTemp3 = x3,y3,z3,0
+ vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
+ // vTemp4i = x4,y4,z4,0
+ __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
+ // vTemp4i = x4,y4,z4,1.0f
+ vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
+ XMMATRIX M(vTemp1,
+ vTemp2,
+ vTemp3,
+ reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMLoadFloat4x4
+(
+ CONST XMFLOAT4X4* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+ XMASSERT(pSource);
+
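+    // Copy all sixteen elements bit-for-bit.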
+ ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
+ ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
+ ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
+ ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];
+
+ ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
+ ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
+ ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
+ ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];
+
+ ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
+ ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
+ ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
+ ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];
+
+ ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
+ ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
+ ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
+ ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSource);
+ XMMATRIX M;
+
+ M.r[0] = _mm_loadu_ps( &pSource->_11 );
+ M.r[1] = _mm_loadu_ps( &pSource->_21 );
+ M.r[2] = _mm_loadu_ps( &pSource->_31 );
+ M.r[3] = _mm_loadu_ps( &pSource->_41 );
+
+ return M;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMLoadFloat4x4A
+(
+ CONST XMFLOAT4X4A* pSource
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ XMASSERT(pSource);
+ XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
+
+ M.r[0].vector4_f32[0] = pSource->m[0][0];
+ M.r[0].vector4_f32[1] = pSource->m[0][1];
+ M.r[0].vector4_f32[2] = pSource->m[0][2];
+ M.r[0].vector4_f32[3] = pSource->m[0][3];
+
+ M.r[1].vector4_f32[0] = pSource->m[1][0];
+ M.r[1].vector4_f32[1] = pSource->m[1][1];
+ M.r[1].vector4_f32[2] = pSource->m[1][2];
+ M.r[1].vector4_f32[3] = pSource->m[1][3];
+
+ M.r[2].vector4_f32[0] = pSource->m[2][0];
+ M.r[2].vector4_f32[1] = pSource->m[2][1];
+ M.r[2].vector4_f32[2] = pSource->m[2][2];
+ M.r[2].vector4_f32[3] = pSource->m[2][3];
+
+ M.r[3].vector4_f32[0] = pSource->m[3][0];
+ M.r[3].vector4_f32[1] = pSource->m[3][1];
+ M.r[3].vector4_f32[2] = pSource->m[3][2];
+ M.r[3].vector4_f32[3] = pSource->m[3][3];
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+
+ XMASSERT(pSource);
+
+ M.r[0] = _mm_load_ps( &pSource->_11 );
+ M.r[1] = _mm_load_ps( &pSource->_21 );
+ M.r[2] = _mm_load_ps( &pSource->_31 );
+ M.r[3] = _mm_load_ps( &pSource->_41 );
+
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * Vector and matrix store operations
+ *
+ ****************************************************************************/
+
+XMFINLINE VOID XMStoreInt
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ *pDestination = XMVectorGetIntX( V );
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ _mm_store_ss( (float*)pDestination, V );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat
+(
+ FLOAT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ *pDestination = XMVectorGetX( V );
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ _mm_store_ss( pDestination, V );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt2
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
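+    // Move element 1 into lane 0, then write both integers with two scalar stores.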
+ XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ _mm_store_ss( (float*)&pDestination[0], V );
+ _mm_store_ss( (float*)&pDestination[1], T );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreSInt2
+(
+ XMINT2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = (INT)V.vector4_f32[0];
+ pDestination->y = (INT)V.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ // In case of positive overflow, detect it
+ XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
+ // Float to int conversion
+ __m128i vResulti = _mm_cvttps_epi32(V);
+ // If there was positive overflow, set to 0x7FFFFFFF
+ XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+ vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
+ vOverflow = _mm_or_ps(vOverflow,vResult);
+ // Write two ints
+ XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ _mm_store_ss( (float*)&pDestination->x, vOverflow );
+ _mm_store_ss( (float*)&pDestination->y, T );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUInt2
+(
+ XMUINT2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = (UINT)V.vector4_f32[0];
+ pDestination->y = (UINT)V.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ // Clamp to >=0
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ // Any numbers that are too big, set to 0xFFFFFFFFU
+ XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+ XMVECTOR vValue = g_XMUnsignedFix;
+ // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+ vValue = _mm_and_ps(vValue,vMask);
+ // Perform fixup only on numbers too large (Keeps low bit precision)
+ vResult = _mm_sub_ps(vResult,vValue);
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+ vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+ vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
+ // On those that are too large, set to 0xFFFFFFFF
+ vResult = _mm_or_ps(vResult,vOverflow);
+ // Write two uints
+ XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ _mm_store_ss( (float*)&pDestination->x, vResult );
+ _mm_store_ss( (float*)&pDestination->y, T );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt2A
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat2
+(
+ XMFLOAT2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
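+    // Move y into lane 0, then write x and y with two scalar stores.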
+ XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ _mm_store_ss( &pDestination->x, V );
+ _mm_store_ss( &pDestination->y, T );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat2A
+(
+ XMFLOAT2A* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreHalf2
+(
+ XMHALF2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
+ pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
+ pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreShortN2
+(
+ XMSHORTN2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
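+    // Clamp to [-1,1], scale to [-32767,32767], convert with rounding, pack to
+    // 16-bit and store both components with a single 32-bit write.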
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = _mm_mul_ps(vResult,Scale);
+ __m128i vResulti = _mm_cvtps_epi32(vResult);
+ vResulti = _mm_packs_epi32(vResulti,vResulti);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreShort2
+(
+ XMSHORT2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
+ static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
+ static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,Min);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Pack the ints into shorts
+ vInt = _mm_packs_epi32(vInt,vInt);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUShortN2
+(
+ XMUSHORTN2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
+ N = XMVectorTruncate(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = _mm_mul_ps(vResult,Scale);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Since the SSE pack instruction clamps using signed rules,
+ // manually extract the values to store them to memory
+ pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
+ pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUShort2
+(
+ XMUSHORT2* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Since the SSE pack instruction clamps using signed rules,
+ // manually extract the values to store them to memory
+ pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
+ pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreByteN2
+(
+ XMBYTEN2* pDestination,
+ FXMVECTOR V
+)
+{
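+ // Scalar path shared by all targets: clamp to [-1,1], scale to [-127,127], round,
+ // then narrow each component to a signed byte via an aligned temporary.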
+ XMVECTOR N;
+ XMFLOAT4A tmp;
+ static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ XMStoreFloat4A( &tmp, N );
+
+ pDestination->x = (CHAR)tmp.x;
+ pDestination->y = (CHAR)tmp.y;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreByte2
+(
+ XMBYTE2* pDestination,
+ FXMVECTOR V
+)
+{
+ XMVECTOR N;
+ XMFLOAT4A tmp;
+ static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f};
+ static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+ N = XMVectorRound(N);
+
+ XMStoreFloat4A( &tmp, N );
+
+ pDestination->x = (CHAR)tmp.x;
+ pDestination->y = (CHAR)tmp.y;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUByteN2
+(
+ XMUBYTEN2* pDestination,
+ FXMVECTOR V
+)
+{
+ XMVECTOR N;
+ XMFLOAT4A tmp;
+ static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
+ N = XMVectorTruncate(N);
+
+ XMStoreFloat4A( &tmp, N );
+
+ pDestination->x = (BYTE)tmp.x;
+ pDestination->y = (BYTE)tmp.y;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUByte2
+(
+ XMUBYTE2* pDestination,
+ FXMVECTOR V
+)
+{
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f};
+ XMFLOAT4A tmp;
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+ N = XMVectorRound(N);
+
+ XMStoreFloat4A( &tmp, N );
+
+ pDestination->x = (BYTE)tmp.x;
+ pDestination->y = (BYTE)tmp.y;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt3
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+ pDestination[2] = V.vector4_u32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss( (float*)pDestination, V );
+ _mm_store_ss( (float*)&pDestination[1], T1 );
+ _mm_store_ss( (float*)&pDestination[2], T2 );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreSInt3
+(
+ XMINT3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = (INT)V.vector4_f32[0];
+ pDestination->y = (INT)V.vector4_f32[1];
+ pDestination->z = (INT)V.vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ // In case of positive overflow, detect it
+ XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
+ // Float to int conversion
+ __m128i vResulti = _mm_cvttps_epi32(V);
+ // If there was positive overflow, set to 0x7FFFFFFF
+ XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+ vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
+ vOverflow = _mm_or_ps(vOverflow,vResult);
+ // Write 3 uints
+ XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss( (float*)&pDestination->x, vOverflow );
+ _mm_store_ss( (float*)&pDestination->y, T1 );
+ _mm_store_ss( (float*)&pDestination->z, T2 );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUInt3
+(
+ XMUINT3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = (UINT)V.vector4_f32[0];
+ pDestination->y = (UINT)V.vector4_f32[1];
+ pDestination->z = (UINT)V.vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
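+ // _mm_cvttps_epi32 only produces signed 32-bit results, so values >= 2^31 are biased down by
+ // 2^31 (g_XMUnsignedFix) before conversion and the high bit is restored with an XOR afterwards.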
+ // Clamp to >=0
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ // Any numbers that are too big, set to 0xFFFFFFFFU
+ XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+ XMVECTOR vValue = g_XMUnsignedFix;
+ // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+ // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+ vValue = _mm_and_ps(vValue,vMask);
+ // Perform fixup only on numbers too large (Keeps low bit precision)
+ vResult = _mm_sub_ps(vResult,vValue);
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Convert from signed to unsigned only if greater than 0x80000000
+ vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+ vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
+ // On those that are too large, set to 0xFFFFFFFF
+ vResult = _mm_or_ps(vResult,vOverflow);
+ // Write 3 uints
+ XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss( (float*)&pDestination->x, vResult );
+ _mm_store_ss( (float*)&pDestination->y, T1 );
+ _mm_store_ss( (float*)&pDestination->z, T2 );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt3A
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+ pDestination[2] = V.vector4_u32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+ _mm_store_ss( (float*)&pDestination[2], T );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3
+(
+ XMFLOAT3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+ pDestination->z = V.vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss( &pDestination->x, V );
+ _mm_store_ss( &pDestination->y, T1 );
+ _mm_store_ss( &pDestination->z, T2 );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3A
+(
+ XMFLOAT3A* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+ pDestination->z = V.vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+ _mm_store_ss( &pDestination->z, T );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUHenDN3
+(
+ XMUHENDN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
+ (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
+ (((UINT)N.vector4_f32[0] & 0x7FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
+ static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
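+ // The y and z scale factors pre-position each component near its final bit offset; z is scaled
+ // by half (and its mask shifted by one less) and doubled after conversion, which appears to keep
+ // the scaled value within signed 32-bit range for _mm_cvttps_epi32.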
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
+ // Do a horizontal or of 3 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
+ // i = x|y
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
+ // Add Z to itself to perform a single bit left shift
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUHenD3
+(
+ XMUHEND3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+
+ pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
+ (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
+ (((UINT)N.vector4_f32[0] & 0x7FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
+ static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
+ static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,MaxUHenD3);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUHenD3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUHenD3);
+ // Do a horizontal or of 3 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
+ // i = x|y
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
+ // Add Z to itself to perform a single bit left shift
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreHenDN3
+(
+ XMHENDN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
+ (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
+ (((INT)N.vector4_f32[0] & 0x7FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleHenDN3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
+ // Do a horizontal or of all 4 entries
+ vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreHenD3
+(
+ XMHEND3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
+ static CONST XMVECTOR Max = {1023.0f, 1023.0f, 511.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+
+ pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
+ (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
+ (((INT)N.vector4_f32[0] & 0x7FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
+ static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
+ static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
+ vResult = _mm_min_ps(vResult,MaxHenD3);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleHenD3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
+ // Do a horizontal or of all 4 entries
+ vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUDHenN3
+(
+ XMUDHENN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
+ (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
+ (((UINT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
+ static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
+ // Do a horizontal or of 3 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
+ // i = x|y
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
+ // Add Z to itself to perform a single bit left shift
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUDHen3
+(
+ XMUDHEN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+
+ pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
+ (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
+ (((UINT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
+ static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
+ static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,MaxUDHen3);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUDHen3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUDHen3);
+ // Do a horizontal or of 3 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
+ // i = x|y
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
+ // Add Z to itself to perform a single bit left shift
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreDHenN3
+(
+ XMDHENN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
+ (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleDHenN3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
+ // Do a horizontal or of all 4 entries
+ vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreDHen3
+(
+ XMDHEN3* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
+ static CONST XMVECTOR Max = {511.0f, 1023.0f, 1023.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+
+ pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
+ (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
+ static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
+ static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
+ vResult = _mm_min_ps(vResult,MaxDHen3);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleDHen3);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
+ // Do a horizontal or of all 4 entries
+ vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreU565
+(
+ XMU565* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // No SSE operations will write to 16-bit values, so we have to extract them manually
+ USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
+ USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
+ USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
+ pDestination->v = ((z & 0x1F) << 11) |
+ ((y & 0x3F) << 5) |
+ ((x & 0x1F));
+#else
+ XMVECTOR N;
+ static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
+ (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
+ (((USHORT)N.vector4_f32[0] & 0x1F));
+#endif // !_XM_SSE_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3PK
+(
+ XMFLOAT3PK* pDestination,
+ FXMVECTOR V
+)
+{
+ _DECLSPEC_ALIGN_16_ UINT IValue[4];
+ UINT I, Sign, j;
+ UINT Result[3];
+
+ XMASSERT(pDestination);
+
+ XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
+
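+ // XMFLOAT3PK packs three positive-only, partial-precision floats into 32 bits: two 11-bit
+ // floats (x, y) and one 10-bit float (z), i.e. the DXGI_FORMAT_R11G11B10_FLOAT layout.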
+ // X & Y Channels (5-bit exponent, 6-bit mantissa)
+ for(j=0; j < 2; ++j)
+ {
+ Sign = IValue[j] & 0x80000000;
+ I = IValue[j] & 0x7FFFFFFF;
+
+ if ((I & 0x7F800000) == 0x7F800000)
+ {
+ // INF or NAN
+ Result[j] = 0x7c0;
+ if (( I & 0x7FFFFF ) != 0)
+ {
+ Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
+ }
+ else if ( Sign )
+ {
+ // -INF is clamped to 0 since 3PK is positive only
+ Result[j] = 0;
+ }
+ }
+ else if ( Sign )
+ {
+ // 3PK is positive only, so clamp to zero
+ Result[j] = 0;
+ }
+ else if (I > 0x477E0000U)
+ {
+ // The number is too large to be represented as a float11, set to max
+ Result[j] = 0x7BF;
+ }
+ else
+ {
+ if (I < 0x38800000U)
+ {
+ // The number is too small to be represented as a normalized float11
+ // Convert it to a denormalized value.
+ UINT Shift = 113U - (I >> 23U);
+ I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+ }
+ else
+ {
+ // Rebias the exponent to represent the value as a normalized float11
+ I += 0xC8000000U;
+ }
+
+ Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
+ }
+ }
+
+ // Z Channel (5-bit exponent, 5-bit mantissa)
+ Sign = IValue[2] & 0x80000000;
+ I = IValue[2] & 0x7FFFFFFF;
+
+ if ((I & 0x7F800000) == 0x7F800000)
+ {
+ // INF or NAN
+ Result[2] = 0x3e0;
+ if ( I & 0x7FFFFF )
+ {
+ Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
+ }
+ else if ( Sign )
+ {
+ // -INF is clamped to 0 since 3PK is positive only
+ Result[2] = 0;
+ }
+ }
+ else if ( Sign )
+ {
+ // 3PK is positive only, so clamp to zero
+ Result[2] = 0;
+ }
+ else if (I > 0x477C0000U)
+ {
+ // The number is too large to be represented as a float10, set to max
+ Result[2] = 0x3df;
+ }
+ else
+ {
+ if (I < 0x38800000U)
+ {
+ // The number is too small to be represented as a normalized float10
+ // Convert it to a denormalized value.
+ UINT Shift = 113U - (I >> 23U);
+ I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+ }
+ else
+ {
+ // Rebias the exponent to represent the value as a normalized float10
+ I += 0xC8000000U;
+ }
+
+ Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
+ }
+
+ // Pack Result into memory
+ pDestination->v = (Result[0] & 0x7ff)
+ | ( (Result[1] & 0x7ff) << 11 )
+ | ( (Result[2] & 0x3ff) << 22 );
+}
+
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3SE
+(
+ XMFLOAT3SE* pDestination,
+ FXMVECTOR V
+)
+{
+ _DECLSPEC_ALIGN_16_ UINT IValue[4];
+ UINT I, Sign, j, T;
+ UINT Frac[3];
+ UINT Exp[3];
+
+
+ XMASSERT(pDestination);
+
+ XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
+
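+ // XMFLOAT3SE is a shared-exponent format: three 9-bit mantissas plus a single 5-bit exponent
+ // (the DXGI_FORMAT_R9G9B9E5_SHAREDEXP layout). Each channel is converted individually first,
+ // then the mantissas are aligned to the largest exponent below.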
+ // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
+ for(j=0; j < 3; ++j)
+ {
+ Sign = IValue[j] & 0x80000000;
+ I = IValue[j] & 0x7FFFFFFF;
+
+ if ((I & 0x7F800000) == 0x7F800000)
+ {
+ // INF or NAN
+ Exp[j] = 0x1f;
+ if (( I & 0x7FFFFF ) != 0)
+ {
+ Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
+ }
+ else if ( Sign )
+ {
+ // -INF is clamped to 0 since 3SE is positive only
+ Exp[j] = Frac[j] = 0;
+ }
+ }
+ else if ( Sign )
+ {
+ // 3SE is positive only, so clamp to zero
+ Exp[j] = Frac[j] = 0;
+ }
+ else if (I > 0x477FC000U)
+ {
+ // The number is too large, set to max
+ Exp[j] = 0x1e;
+ Frac[j] = 0x1ff;
+ }
+ else
+ {
+ if (I < 0x38800000U)
+ {
+ // The number is too small to be represented as a normalized float11
+ // Convert it to a denormalized value.
+ UINT Shift = 113U - (I >> 23U);
+ I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+ }
+ else
+ {
+ // Rebias the exponent to represent the value as a normalized float11
+ I += 0xC8000000U;
+ }
+
+ T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;
+
+ Exp[j] = (T & 0x3E00) >> 9;
+ Frac[j] = T & 0x1ff;
+ }
+ }
+
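+ // Note: Frac[] holds only the explicit mantissa bits; no implicit leading 1 is re-added before
+ // the shifts below, so channels with a smaller exponent than the shared one appear to lose their
+ // leading bit of precision.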
+ // Adjust to a shared exponent
+ T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );
+
+ Frac[0] = Frac[0] >> (T - Exp[0]);
+ Frac[1] = Frac[1] >> (T - Exp[1]);
+ Frac[2] = Frac[2] >> (T - Exp[2]);
+
+ // Store packed into memory
+ pDestination->xm = Frac[0];
+ pDestination->ym = Frac[1];
+ pDestination->zm = Frac[2];
+ pDestination->e = T;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt4
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+ pDestination[2] = V.vector4_u32[2];
+ pDestination[3] = V.vector4_u32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt4A
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+ pDestination[2] = V.vector4_u32[2];
+ pDestination[3] = V.vector4_u32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreSInt4
+(
+ XMINT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->x = (INT)V.vector4_f32[0];
+ pDestination->y = (INT)V.vector4_f32[1];
+ pDestination->z = (INT)V.vector4_f32[2];
+ pDestination->w = (INT)V.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ // In case of positive overflow, detect it
+ XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
+ // Float to int conversion
+ __m128i vResulti = _mm_cvttps_epi32(V);
+ // If there was positive overflow, set to 0x7FFFFFFF
+ XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+ vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
+ vOverflow = _mm_or_ps(vOverflow,vResult);
+ _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vOverflow)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUInt4
+(
+ XMUINT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->x = (UINT)V.vector4_f32[0];
+ pDestination->y = (UINT)V.vector4_f32[1];
+ pDestination->z = (UINT)V.vector4_f32[2];
+ pDestination->w = (UINT)V.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ // Clamp to >=0
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ // Any numbers that are too big, set to 0xFFFFFFFFU
+ XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+ XMVECTOR vValue = g_XMUnsignedFix;
+ // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+ // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+ vValue = _mm_and_ps(vValue,vMask);
+ // Perform fixup only on numbers too large (Keeps low bit precision)
+ vResult = _mm_sub_ps(vResult,vValue);
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Convert from signed to unsigned only if greater than 0x80000000
+ vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+ vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
+ // On those that are too large, set to 0xFFFFFFFF
+ vResult = _mm_or_ps(vResult,vOverflow);
+ _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vResult)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreInt4NC
+(
+ UINT* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination[0] = V.vector4_u32[0];
+ pDestination[1] = V.vector4_u32[1];
+ pDestination[2] = V.vector4_u32[2];
+ pDestination[3] = V.vector4_u32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4
+(
+ XMFLOAT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+ pDestination->z = V.vector4_f32[2];
+ pDestination->w = V.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ _mm_storeu_ps( &pDestination->x, V );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4A
+(
+ XMFLOAT4A* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+ pDestination->z = V.vector4_f32[2];
+ pDestination->w = V.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ _mm_store_ps( &pDestination->x, V );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4NC
+(
+ XMFLOAT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ pDestination->x = V.vector4_f32[0];
+ pDestination->y = V.vector4_f32[1];
+ pDestination->z = V.vector4_f32[2];
+ pDestination->w = V.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 3) == 0);
+
+ _mm_storeu_ps( &pDestination->x, V );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreHalf4
+(
+ XMHALF4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
+ pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
+ pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
+ pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
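+ // SSE2 has no float-to-half conversion instruction, so convert each component with the scalar helper.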
+ pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
+ pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
+ pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
+ pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreShortN4
+(
+ XMSHORTN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+ pDestination->z = (SHORT)N.vector4_f32[2];
+ pDestination->w = (SHORT)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
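+ // Clamp to [-1,1], scale to [-32767,32767], convert with rounding, pack to 16-bit with signed
+ // saturation, then store the low 8 bytes (all four shorts) in a single write.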
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = _mm_mul_ps(vResult,Scale);
+ __m128i vResulti = _mm_cvtps_epi32(vResult);
+ vResulti = _mm_packs_epi32(vResulti,vResulti);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreShort4
+(
+ XMSHORT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
+ static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+ pDestination->z = (SHORT)N.vector4_f32[2];
+ pDestination->w = (SHORT)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
+ static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,Min);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Pack the ints into shorts
+ vInt = _mm_packs_epi32(vInt,vInt);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUShortN4
+(
+ XMUSHORTN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
+ N = XMVectorTruncate(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+ pDestination->z = (SHORT)N.vector4_f32[2];
+ pDestination->w = (SHORT)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = _mm_mul_ps(vResult,Scale);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Since the SSE pack instruction clamps using signed rules,
+ // manually extract the values to store them to memory
+ pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
+ pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
+ pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
+ pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUShort4
+(
+ XMUSHORT4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (SHORT)N.vector4_f32[0];
+ pDestination->y = (SHORT)N.vector4_f32[1];
+ pDestination->z = (SHORT)N.vector4_f32[2];
+ pDestination->w = (SHORT)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Since the SSE pack instruction clamps using signed rules,
+ // manually extract the values to store them to memory
+ pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
+ pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
+ pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
+ pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreXIcoN4
+(
+ XMXICON4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
+ static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
+ (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((INT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
+ static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
+ static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};
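+ // The components are processed in x,w,y,z lane order (see the shuffle below) so that, after the
+ // 64-bit shift trick, x lands in bits 0-19, y in 20-39, z in 40-59 and the unsigned w in 60-63.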
+
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,MinXIcoN4);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
+ // Convert to integer (w is unsigned)
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off unused bits
+ vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
+ // Isolate Y
+ __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
+ // Double Y (Really W) to fixup for unsigned conversion
+ vResulti = _mm_add_epi32(vResulti,vResulti2);
+ // Shift y and z to straddle the 32-bit boundary
+ vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreXIco4
+(
+ XMXICO4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
+ static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};
+
+ XMASSERT(pDestination);
+ N = XMVectorClamp(V, Min.v, Max.v);
+ pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
+ (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((INT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
+ static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
+ static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
+ static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,MinXIco4);
+ vResult = _mm_min_ps(vResult,MaxXIco4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleXIco4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskXIco4);
+ // Isolate Y
+ __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
+ // Double Y (Really W) to fixup for unsigned conversion
+ vResulti = _mm_add_epi32(vResulti,vResulti2);
+ // Shift y and z to straddle the 32-bit boundary
+ vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUIcoN4
+(
+ XMUICON4* pDestination,
+ FXMVECTOR V
+)
+{
+ #define XM_URange ((FLOAT)(1 << 20))
+ #define XM_URangeDiv2 ((FLOAT)(1 << 19))
+ #define XM_UMaxXYZ ((FLOAT)((1 << 20) - 1))
+ #define XM_UMaxW ((FLOAT)((1 << 4) - 1))
+ #define XM_ScaleXYZ (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
+ #define XM_ScaleW (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
+ #define XM_Scale (-1.0f / XM_PACK_FACTOR)
+ #define XM_Offset (3.0f)
+
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
+
+ pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
+ (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((UINT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
+ static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
+ static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
+ // Adjust for unsigned entries
+ vResult = _mm_add_ps(vResult,AddUIcoN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Fix the signs on the unsigned entries
+ vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
+ // Shift y and z to straddle the 32-bit boundary
+ __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+
+ #undef XM_URange
+ #undef XM_URangeDiv2
+ #undef XM_UMaxXYZ
+ #undef XM_UMaxW
+ #undef XM_ScaleXYZ
+ #undef XM_ScaleW
+ #undef XM_Scale
+ #undef XM_Offset
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUIco4
+(
+ XMUICO4* pDestination,
+ FXMVECTOR V
+)
+{
+ #define XM_Scale (-1.0f / XM_PACK_FACTOR)
+ #define XM_URange ((FLOAT)(1 << 20))
+ #define XM_URangeDiv2 ((FLOAT)(1 << 19))
+
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
+ (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((UINT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
+ static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
+ static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
+ static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,g_XMZero);
+ vResult = _mm_min_ps(vResult,MaxUIco4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUIco4);
+ vResult = _mm_add_ps(vResult,AddUIco4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUIco4);
+ // Shift y and z to straddle the 32-bit boundary
+ __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+
+ #undef XM_Scale
+ #undef XM_URange
+ #undef XM_URangeDiv2
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreIcoN4
+(
+ XMICON4* pDestination,
+ FXMVECTOR V
+)
+{
+ #define XM_Scale (-1.0f / XM_PACK_FACTOR)
+ #define XM_URange ((FLOAT)(1 << 4))
+ #define XM_Offset (3.0f)
+ #define XM_UMaxXYZ ((FLOAT)((1 << (20 - 1)) - 1))
+ #define XM_UMaxW ((FLOAT)((1 << (4 - 1)) - 1))
+
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
+ (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((UINT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
+ static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleIcoN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskIcoN4);
+ // Shift y and z to straddle the 32-bit boundary
+ __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+
+ #undef XM_Scale
+ #undef XM_URange
+ #undef XM_Offset
+ #undef XM_UMaxXYZ
+ #undef XM_UMaxW
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreIco4
+(
+ XMICO4* pDestination,
+ FXMVECTOR V
+)
+{
+ #define XM_Scale (-1.0f / XM_PACK_FACTOR)
+ #define XM_URange ((FLOAT)(1 << 4))
+ #define XM_Offset (3.0f)
+
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
+ static CONST XMVECTOR Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
+ (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
+ (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
+ (((INT64)N.vector4_f32[0] & 0xFFFFF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ // Note: Masks are x,w,y and z
+ static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
+ static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
+ static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
+ static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
+ vResult = _mm_max_ps(vResult,MinIco4);
+ vResult = _mm_min_ps(vResult,MaxIco4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleIco4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskIco4);
+ // Shift y and z to straddle the 32-bit boundary
+ __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
+ // Shift it into place
+ vResulti2 = _mm_slli_si128(vResulti2,20/8);
+ // i = x|y<<20|z<<40|w<<60
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+
+ #undef XM_Scale
+ #undef XM_URange
+ #undef XM_Offset
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreXDecN4
+(
+ XMXDECN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
+ static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
+ (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
+ static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
+ static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
+ XMASSERT(pDestination);
+ XMVECTOR vResult = _mm_max_ps(V,Min);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,Scale);
+ // Convert to int (W is unsigned)
+ __m128i vResulti = _mm_cvtps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,ScaleMask);
+ // To fix W, add itself to shift it up to <<30 instead of <<29
+ __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
+ vResulti = _mm_add_epi32(vResulti,vResultw);
+ // Do a horizontal or of all 4 entries
+ vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreXDec4
+(
+ XMXDEC4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, 0.0f};
+ static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 3.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+
+ pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
+ (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
+ static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
+ static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+ static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
+ vResult = _mm_min_ps(vResult,MaxXDec4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleXDec4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskXDec4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // Perform a single bit left shift on y|w
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUDecN4
+(
+ XMUDECN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
+ (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((UINT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
+ static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUDecN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUDecN4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // Perform a left shift by one bit on y|w
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
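+
+// Illustrative note (not part of the original XNAMath sources): in the SSE
+// paths above, fields that would land at or above bit 30 are scaled by an
+// extra 0.5f because _mm_cvttps_epi32 produces signed 32-bit results and would
+// saturate for values of 2^31 or more.  After the horizontal OR the combined
+// y|w lane is doubled once with _mm_add_epi32, which shifts both fields into
+// their final positions with a single operation.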
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUDec4
+(
+ XMUDEC4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+
+ pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
+ (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((UINT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
+ static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+ static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,MaxUDec4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUDec4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUDec4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // Perform a left shift by one bit on y|w
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreDecN4
+(
+ XMDECN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+
+ pDestination->v = ((INT)N.vector4_f32[3] << 30) |
+ (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
+ static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleDecN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskDecN4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreDec4
+(
+ XMDEC4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, -1.0f};
+ static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+
+ pDestination->v = ((INT)N.vector4_f32[3] << 30) |
+ (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
+ (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
+ (((INT)N.vector4_f32[0] & 0x3FF));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
+ static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
+ static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
+ static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,MinDec4);
+ vResult = _mm_min_ps(vResult,MaxDec4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleDec4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskDec4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUByteN4
+(
+ XMUBYTEN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->x = (BYTE)N.vector4_f32[0];
+ pDestination->y = (BYTE)N.vector4_f32[1];
+ pDestination->z = (BYTE)N.vector4_f32[2];
+ pDestination->w = (BYTE)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
+ static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUByteN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUByteN4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // Perform a single bit left shift to fix y|w
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
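+
+// Usage sketch (illustrative, not part of the original sources): XMStoreUByteN4
+// saturates each component to [0,1], scales by 255 and packs the result into
+// four bytes.  The helper name below is hypothetical.
+XMFINLINE XMUBYTEN4 XMExamplePackColorBytes(FXMVECTOR vRGBA)
+{
+ XMUBYTEN4 packed;
+ XMStoreUByteN4(&packed, vRGBA); // packed.x..packed.w hold R,G,B,A in 0-255
+ return packed;
+}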
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUByte4
+(
+ XMUBYTE4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (BYTE)N.vector4_f32[0];
+ pDestination->y = (BYTE)N.vector4_f32[1];
+ pDestination->z = (BYTE)N.vector4_f32[2];
+ pDestination->w = (BYTE)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
+ static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
+ static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,MaxUByte4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleUByte4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskUByte4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // Perform a single bit left shift to fix y|w
+ vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreByteN4
+(
+ XMBYTEN4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->x = (CHAR)N.vector4_f32[0];
+ pDestination->y = (CHAR)N.vector4_f32[1];
+ pDestination->z = (CHAR)N.vector4_f32[2];
+ pDestination->w = (CHAR)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
+ static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleByteN4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskByteN4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreByte4
+(
+ XMBYTE4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f};
+ static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, Min, Max);
+ N = XMVectorRound(N);
+
+ pDestination->x = (CHAR)N.vector4_f32[0];
+ pDestination->y = (CHAR)N.vector4_f32[1];
+ pDestination->z = (CHAR)N.vector4_f32[2];
+ pDestination->w = (CHAR)N.vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
+ static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
+ static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
+ static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+ // Clamp to bounds
+ XMVECTOR vResult = _mm_max_ps(V,MinByte4);
+ vResult = _mm_min_ps(vResult,MaxByte4);
+ // Scale by multiplication
+ vResult = _mm_mul_ps(vResult,ScaleByte4);
+ // Convert to int
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Mask off any fraction
+ vResulti = _mm_and_si128(vResulti,MaskByte4);
+ // Do a horizontal or of 4 entries
+ __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+ // x = x|z, y = y|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ // Move Z to the x position
+ vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+ // i = x|y|z|w
+ vResulti = _mm_or_si128(vResulti,vResulti2);
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreUNibble4
+(
+ XMUNIBBLE4* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // No SSE operations will write to 16-bit values, so we have to extract them manually
+ USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
+ USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
+ USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
+ USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
+ pDestination->v = ((w & 0xF) << 12) |
+ ((z & 0xF) << 8) |
+ ((y & 0xF) << 4) |
+ ((x & 0xF));
+#else
+ XMVECTOR N;
+ static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
+ (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
+ (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
+ (((USHORT)N.vector4_f32[0] & 0xF));
+#endif // !_XM_SSE_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreU555(
+ XMU555* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+ // Bounds check
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ vResult = _mm_min_ps(vResult,Max);
+ // Convert to int with rounding
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // No SSE operations will write to 16-bit values, so we have to extract them manually
+ USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
+ USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
+ USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
+ USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
+ pDestination->v = ((w) ? 0x8000 : 0) |
+ ((z & 0x1F) << 10) |
+ ((y & 0x1F) << 5) |
+ ((x & 0x1F));
+#else
+ XMVECTOR N;
+ static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorClamp(V, XMVectorZero(), Max.v);
+ N = XMVectorRound(N);
+
+ pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
+ (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
+ (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
+ (((USHORT)N.vector4_f32[0] & 0x1F));
+#endif // !_XM_SSE_INTRINSICS_
+}
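+
+// Usage sketch (illustrative, not part of the original sources): building a
+// 1:5:5:5 value with XMStoreU555.  The x/y/z components are expected in
+// [0,31] and a non-zero w sets the high bit; the helper name is hypothetical.
+XMFINLINE XMU555 XMExampleMakeU555(FLOAT r, FLOAT g, FLOAT b, BOOL bAlpha)
+{
+ XMU555 packed;
+ XMStoreU555(&packed, XMVectorSet(r * 31.0f, g * 31.0f, b * 31.0f, bAlpha ? 1.0f : 0.0f));
+ return packed;
+}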
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreColor
+(
+ XMCOLOR* pDestination,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};
+
+ XMASSERT(pDestination);
+
+ N = XMVectorSaturate(V);
+ N = XMVectorMultiply(N, Scale.v);
+ N = XMVectorRound(N);
+
+ pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
+ ((UINT)N.vector4_f32[0] << 16) |
+ ((UINT)N.vector4_f32[1] << 8) |
+ ((UINT)N.vector4_f32[2]);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ static CONST XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f};
+ // Set <0 to 0
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ // Set >1 to 1
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ // Convert to 0-255
+ vResult = _mm_mul_ps(vResult,Scale);
+ // Shuffle RGBA to ARGB
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
+ // Convert to int
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Mash to shorts
+ vInt = _mm_packs_epi32(vInt,vInt);
+ // Mash to bytes
+ vInt = _mm_packus_epi16(vInt,vInt);
+ // Store the color
+ _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
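+
+// Usage sketch (illustrative, not part of the original sources): XMStoreColor
+// saturates, scales to 0-255 and stores the vector as a packed A8R8G8B8 value.
+// The helper name is hypothetical.
+XMFINLINE XMCOLOR XMExampleMakeColor(FXMVECTOR vRGBA)
+{
+ XMCOLOR c;
+ XMStoreColor(&c, vRGBA); // c.c == (a << 24) | (r << 16) | (g << 8) | b
+ return c;
+}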
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3x3
+(
+ XMFLOAT3X3* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
+
+ XMStoreFloat3x3NC(pDestination, M);
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat3x3NC
+(
+ XMFLOAT3X3* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMVECTOR vTemp1 = M.r[0];
+ XMVECTOR vTemp2 = M.r[1];
+ XMVECTOR vTemp3 = M.r[2];
+ XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
+ vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
+ _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x3
+(
+ XMFLOAT4X3* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
+
+ XMStoreFloat4x3NC(pDestination, M);
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x3A
+(
+ XMFLOAT4X3A* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+ // x1,y1,z1,w1
+ XMVECTOR vTemp1 = M.r[0];
+ // x2,y2,z2,w2
+ XMVECTOR vTemp2 = M.r[1];
+ // x3,y3,z3,w3
+ XMVECTOR vTemp3 = M.r[2];
+ // x4,y4,z4,w4
+ XMVECTOR vTemp4 = M.r[3];
+ // z1,z1,x2,y2
+ XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
+ // y2,z2,x3,y3 (Final)
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ // x1,y1,z1,x2 (Final)
+ vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
+ // z3,z3,x4,x4
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
+ // z3,x4,y4,z4 (Final)
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
+ // Store in 3 operations
+ _mm_store_ps(&pDestination->m[0][0],vTemp1);
+ _mm_store_ps(&pDestination->m[1][1],vTemp2);
+ _mm_store_ps(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x3NC
+(
+ XMFLOAT4X3* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ XMVECTOR vTemp1 = M.r[0];
+ XMVECTOR vTemp2 = M.r[1];
+ XMVECTOR vTemp3 = M.r[2];
+ XMVECTOR vTemp4 = M.r[3];
+ XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
+ vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
+ _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
+ _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
+ _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x4
+(
+ XMFLOAT4X4* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+
+ XMStoreFloat4x4NC(pDestination, M);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ _mm_storeu_ps( &pDestination->_11, M.r[0] );
+ _mm_storeu_ps( &pDestination->_21, M.r[1] );
+ _mm_storeu_ps( &pDestination->_31, M.r[2] );
+ _mm_storeu_ps( &pDestination->_41, M.r[3] );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x4A
+(
+ XMFLOAT4X4A* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+ XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+ pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+ pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+ pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+ pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+
+ _mm_store_ps( &pDestination->_11, M.r[0] );
+ _mm_store_ps( &pDestination->_21, M.r[1] );
+ _mm_store_ps( &pDestination->_31, M.r[2] );
+ _mm_store_ps( &pDestination->_41, M.r[3] );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
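+
+// Illustrative note (not part of the original sources): the A-suffixed stores
+// assume a 16-byte aligned destination and use aligned SSE stores, while the
+// plain XMStoreFloat4x4 above tolerates any address.  A hypothetical wrapper
+// that only ever uses the aligned type:
+XMFINLINE VOID XMExampleStoreAligned4x4(XMFLOAT4X4A* pOut, CXMMATRIX M)
+{
+ XMStoreFloat4x4A(pOut, M); // XMFLOAT4X4A already carries the 16-byte alignment
+}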
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMStoreFloat4x4NC
+(
+ XMFLOAT4X4* pDestination,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMASSERT(pDestination);
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+ pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+ pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+ pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+ pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pDestination);
+ _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
+ _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
+ _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
+ _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#endif // __XNAMATHCONVERT_INL__
+
diff --git a/thirdparty/directxtex/XNAMath/xnamathmatrix.inl b/thirdparty/directxtex/XNAMath/xnamathmatrix.inl
new file mode 100644
index 00000000..eb9f1644
--- /dev/null
+++ b/thirdparty/directxtex/XNAMath/xnamathmatrix.inl
@@ -0,0 +1,3293 @@
+/************************************************************************
+* *
+* xnamathmatrix.inl -- SIMD C++ Math library for Windows and Xbox 360 *
+* Matrix functions *
+* *
+* Copyright (c) Microsoft Corp. All rights reserved. *
+* *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATHMATRIX_INL__
+#define __XNAMATHMATRIX_INL__
+
+/****************************************************************************
+ *
+ * Matrix
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+// Return TRUE if any entry in the matrix is NaN
+XMFINLINE BOOL XMMatrixIsNaN
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT i, uTest;
+ const UINT *pWork;
+
+ i = 16;
+ pWork = (const UINT *)(&M.m[0][0]);
+ do {
+ // Fetch value into integer unit
+ uTest = pWork[0];
+ // Remove sign
+ uTest &= 0x7FFFFFFFU;
+ // NaN is 0x7F800001 through 0x7FFFFFFF inclusive
+ uTest -= 0x7F800001U;
+ if (uTest<0x007FFFFFU) {
+ break; // NaN found
+ }
+ ++pWork; // Next entry
+ } while (--i);
+ return (i!=0); // i == 0 if nothing matched
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Load in registers
+ XMVECTOR vX = M.r[0];
+ XMVECTOR vY = M.r[1];
+ XMVECTOR vZ = M.r[2];
+ XMVECTOR vW = M.r[3];
+ // Test themselves to check for NaN
+ vX = _mm_cmpneq_ps(vX,vX);
+ vY = _mm_cmpneq_ps(vY,vY);
+ vZ = _mm_cmpneq_ps(vZ,vZ);
+ vW = _mm_cmpneq_ps(vW,vW);
+ // Or all the results
+ vX = _mm_or_ps(vX,vZ);
+ vY = _mm_or_ps(vY,vW);
+ vX = _mm_or_ps(vX,vY);
+ // If any tested true, return true
+ return (_mm_movemask_ps(vX)!=0);
+#else
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return TRUE if any entry in the matrix is +/-INF
+XMFINLINE BOOL XMMatrixIsInfinite
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT i, uTest;
+ const UINT *pWork;
+
+ i = 16;
+ pWork = (const UINT *)(&M.m[0][0]);
+ do {
+ // Fetch value into integer unit
+ uTest = pWork[0];
+ // Remove sign
+ uTest &= 0x7FFFFFFFU;
+ // INF is 0x7F800000
+ if (uTest==0x7F800000U) {
+ break; // INF found
+ }
+ ++pWork; // Next entry
+ } while (--i);
+ return (i!=0); // i == 0 if nothing matched
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the sign bits
+ XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask);
+ XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask);
+ XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask);
+ XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask);
+ // Compare to infinity
+ vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity);
+ vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity);
+ vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity);
+ vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity);
+ // Or the answers together
+ vTemp1 = _mm_or_ps(vTemp1,vTemp2);
+ vTemp3 = _mm_or_ps(vTemp3,vTemp4);
+ vTemp1 = _mm_or_ps(vTemp1,vTemp3);
+ // If any are infinity, the corresponding mask bits are set.
+ return (_mm_movemask_ps(vTemp1)!=0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return TRUE if the XMMatrix is equal to identity
+XMFINLINE BOOL XMMatrixIsIdentity
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ unsigned int uOne, uZero;
+ const unsigned int *pWork;
+
+ // Use the integer pipeline to reduce branching to a minimum
+ pWork = (const unsigned int*)(&M.m[0][0]);
+ // Convert 1.0f to zero and or them together
+ uOne = pWork[0]^0x3F800000U;
+ // Or all the 0.0f entries together
+ uZero = pWork[1];
+ uZero |= pWork[2];
+ uZero |= pWork[3];
+ // 2nd row
+ uZero |= pWork[4];
+ uOne |= pWork[5]^0x3F800000U;
+ uZero |= pWork[6];
+ uZero |= pWork[7];
+ // 3rd row
+ uZero |= pWork[8];
+ uZero |= pWork[9];
+ uOne |= pWork[10]^0x3F800000U;
+ uZero |= pWork[11];
+ // 4th row
+ uZero |= pWork[12];
+ uZero |= pWork[13];
+ uZero |= pWork[14];
+ uOne |= pWork[15]^0x3F800000U;
+ // If all zero entries are zero, then uZero==0
+ uZero &= 0x7FFFFFFF; // Allow -0.0f
+ // If all 1.0f entries are 1.0f, then uOne==0
+ uOne |= uZero;
+ return (uOne==0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0);
+ XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1);
+ XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2);
+ XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3);
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ vTemp3 = _mm_and_ps(vTemp3,vTemp4);
+ vTemp1 = _mm_and_ps(vTemp1,vTemp3);
+ return (_mm_movemask_ps(vTemp1)==0x0f);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
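+
+// Illustrative note (not part of the original sources): both paths above test
+// for an exact bitwise identity (only -0.0f is tolerated in the zero entries),
+// so a matrix that is merely close to the identity returns FALSE; if a
+// tolerance is needed, compare the rows yourself with something like
+// XMVector4NearEqual instead.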
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Perform a 4x4 matrix multiply by a 4x4 matrix
+XMFINLINE XMMATRIX XMMatrixMultiply
+(
+ CXMMATRIX M1,
+ CXMMATRIX M2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX mResult;
+ // Cache the invariants in registers
+ float x = M1.m[0][0];
+ float y = M1.m[0][1];
+ float z = M1.m[0][2];
+ float w = M1.m[0][3];
+ // Perform the operation on the first row
+ mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w);
+ mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w);
+ mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w);
+ mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w);
+ // Repeat for all the other rows
+ x = M1.m[1][0];
+ y = M1.m[1][1];
+ z = M1.m[1][2];
+ w = M1.m[1][3];
+ mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w);
+ mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w);
+ mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w);
+ mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w);
+ x = M1.m[2][0];
+ y = M1.m[2][1];
+ z = M1.m[2][2];
+ w = M1.m[2][3];
+ mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w);
+ mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w);
+ mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w);
+ mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w);
+ x = M1.m[3][0];
+ y = M1.m[3][1];
+ z = M1.m[3][2];
+ w = M1.m[3][3];
+ mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w);
+ mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w);
+ mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w);
+ mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w);
+ return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX mResult;
+ // Use vW to hold the original row
+ XMVECTOR vW = M1.r[0];
+ // Splat the component X,Y,Z then W
+ XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
+ XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
+ vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
+ // Perform the operation on the first row
+ vX = _mm_mul_ps(vX,M2.r[0]);
+ vY = _mm_mul_ps(vY,M2.r[1]);
+ vZ = _mm_mul_ps(vZ,M2.r[2]);
+ vW = _mm_mul_ps(vW,M2.r[3]);
+ // Perform a binary add to reduce cumulative errors
+ vX = _mm_add_ps(vX,vZ);
+ vY = _mm_add_ps(vY,vW);
+ vX = _mm_add_ps(vX,vY);
+ mResult.r[0] = vX;
+ // Repeat for the other 3 rows
+ vW = M1.r[1];
+ vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
+ vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
+ vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
+ vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
+ vX = _mm_mul_ps(vX,M2.r[0]);
+ vY = _mm_mul_ps(vY,M2.r[1]);
+ vZ = _mm_mul_ps(vZ,M2.r[2]);
+ vW = _mm_mul_ps(vW,M2.r[3]);
+ vX = _mm_add_ps(vX,vZ);
+ vY = _mm_add_ps(vY,vW);
+ vX = _mm_add_ps(vX,vY);
+ mResult.r[1] = vX;
+ vW = M1.r[2];
+ vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
+ vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
+ vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
+ vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
+ vX = _mm_mul_ps(vX,M2.r[0]);
+ vY = _mm_mul_ps(vY,M2.r[1]);
+ vZ = _mm_mul_ps(vZ,M2.r[2]);
+ vW = _mm_mul_ps(vW,M2.r[3]);
+ vX = _mm_add_ps(vX,vZ);
+ vY = _mm_add_ps(vY,vW);
+ vX = _mm_add_ps(vX,vY);
+ mResult.r[2] = vX;
+ vW = M1.r[3];
+ vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
+ vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
+ vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
+ vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
+ vX = _mm_mul_ps(vX,M2.r[0]);
+ vY = _mm_mul_ps(vY,M2.r[1]);
+ vZ = _mm_mul_ps(vZ,M2.r[2]);
+ vW = _mm_mul_ps(vW,M2.r[3]);
+ vX = _mm_add_ps(vX,vZ);
+ vY = _mm_add_ps(vY,vW);
+ vX = _mm_add_ps(vX,vY);
+ mResult.r[3] = vX;
+ return mResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
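+
+// Usage sketch (illustrative, not part of the original sources): with the
+// row-vector convention used by XMVector3Transform and friends,
+// XMMatrixMultiply(A, B) applies A first and B second, so a typical composite
+// transform is built as shown by this hypothetical helper:
+XMFINLINE XMMATRIX XMExampleWorldViewProj(CXMMATRIX mWorld, CXMMATRIX mView, CXMMATRIX mProj)
+{
+ return XMMatrixMultiply(XMMatrixMultiply(mWorld, mView), mProj);
+}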
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixMultiplyTranspose
+(
+ CXMMATRIX M1,
+ CXMMATRIX M2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX mResult;
+ // Cache the invariants in registers
+ float x = M2.m[0][0];
+ float y = M2.m[1][0];
+ float z = M2.m[2][0];
+ float w = M2.m[3][0];
+ // Perform the operation on the first row
+ mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w);
+ mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w);
+ mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w);
+ mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w);
+ // Repeat for all the other rows
+ x = M2.m[0][1];
+ y = M2.m[1][1];
+ z = M2.m[2][1];
+ w = M2.m[3][1];
+ mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w);
+ mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w);
+ mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w);
+ mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w);
+ x = M2.m[0][2];
+ y = M2.m[1][2];
+ z = M2.m[2][2];
+ w = M2.m[3][2];
+ mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w);
+ mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w);
+ mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w);
+ mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w);
+ x = M2.m[0][3];
+ y = M2.m[1][3];
+ z = M2.m[2][3];
+ w = M2.m[3][3];
+ mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w);
+ mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w);
+ mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w);
+ mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w);
+ return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX Product;
+ XMMATRIX Result;
+ Product = XMMatrixMultiply(M1, M2);
+ Result = XMMatrixTranspose(Product);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixTranspose
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX P;
+ XMMATRIX MT;
+
+ // Original matrix:
+ //
+ // m00m01m02m03
+ // m10m11m12m13
+ // m20m21m22m23
+ // m30m31m32m33
+
+ P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21
+ P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31
+ P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23
+ P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33
+
+ MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30
+ MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31
+ MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32
+ MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33
+
+ return MT;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // x.x,x.y,y.x,y.y
+ XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0));
+ // x.z,x.w,y.z,y.w
+ XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2));
+ // z.x,z.y,w.x,w.y
+ XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0));
+ // z.z,z.w,w.z,w.w
+ XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2));
+ XMMATRIX mResult;
+
+ // x.x,y.x,z.x,w.x
+ mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0));
+ // x.y,y.y,z.y,w.y
+ mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1));
+ // x.z,y.z,z.z,w.z
+ mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0));
+ // x.w,y.w,z.w,w.w
+ mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1));
+ return mResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
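+
+// Usage sketch (illustrative, not part of the original sources): row-major
+// XNAMath matrices are often transposed before being copied into shader
+// constants compiled with column-major packing.  Hypothetical helper:
+XMFINLINE VOID XMExampleStoreTransposed(XMFLOAT4X4* pOut, CXMMATRIX M)
+{
+ XMStoreFloat4x4(pOut, XMMatrixTranspose(M));
+}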
+
+//------------------------------------------------------------------------------
+// Return the inverse and the determinant of a 4x4 matrix
+XMINLINE XMMATRIX XMMatrixInverse
+(
+ XMVECTOR* pDeterminant,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX R;
+ XMMATRIX MT;
+ XMVECTOR D0, D1, D2;
+ XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7;
+ XMVECTOR V0[4], V1[4];
+ XMVECTOR Determinant;
+ XMVECTOR Reciprocal;
+ XMMATRIX Result;
+ static CONST XMVECTORU32 SwizzleXXYY = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 SwizzleZWZW = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 SwizzleYZXY = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 SwizzleZWYZ = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Y, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 SwizzleWXWX = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 SwizzleZXYX = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 SwizzleYWXZ = {XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 SwizzleWZWY = {XM_PERMUTE_0W, XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 Permute0X0Z1X1Z = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z};
+ static CONST XMVECTORU32 Permute0Y0W1Y1W = {XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W};
+ static CONST XMVECTORU32 Permute1Y0Y0W0X = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 Permute0W0X0Y1X = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X};
+ static CONST XMVECTORU32 Permute0Z1Y1X0Z = {XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 Permute0W1Y0Y0Z = {XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 Permute0Z0Y1X0X = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 Permute1Y0X0W1X = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X};
+ static CONST XMVECTORU32 Permute1W0Y0W0X = {XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 Permute0W0X0Y1Z = {XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z};
+ static CONST XMVECTORU32 Permute0Z1W1Z0Z = {XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 Permute0W1W0Y0Z = {XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 Permute0Z0Y1Z0X = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 Permute1W0X0W1Z = {XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z};
+
+ MT = XMMatrixTranspose(M);
+
+ V0[0] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleXXYY.v);
+ V1[0] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleZWZW.v);
+ V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleXXYY.v);
+ V1[1] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleZWZW.v);
+ V0[2] = XMVectorPermute(MT.r[2], MT.r[0], Permute0X0Z1X1Z.v);
+ V1[2] = XMVectorPermute(MT.r[3], MT.r[1], Permute0Y0W1Y1W.v);
+
+ D0 = XMVectorMultiply(V0[0], V1[0]);
+ D1 = XMVectorMultiply(V0[1], V1[1]);
+ D2 = XMVectorMultiply(V0[2], V1[2]);
+
+ V0[0] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleZWZW.v);
+ V1[0] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleXXYY.v);
+ V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleZWZW.v);
+ V1[1] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleXXYY.v);
+ V0[2] = XMVectorPermute(MT.r[2], MT.r[0], Permute0Y0W1Y1W.v);
+ V1[2] = XMVectorPermute(MT.r[3], MT.r[1], Permute0X0Z1X1Z.v);
+
+ D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0);
+ D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1);
+ D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2);
+
+ V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleYZXY.v);
+ V1[0] = XMVectorPermute(D0, D2, Permute1Y0Y0W0X.v);
+ V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleZXYX.v);
+ V1[1] = XMVectorPermute(D0, D2, Permute0W1Y0Y0Z.v);
+ V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleYZXY.v);
+ V1[2] = XMVectorPermute(D1, D2, Permute1W0Y0W0X.v);
+ V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleZXYX.v);
+ V1[3] = XMVectorPermute(D1, D2, Permute0W1W0Y0Z.v);
+
+ C0 = XMVectorMultiply(V0[0], V1[0]);
+ C2 = XMVectorMultiply(V0[1], V1[1]);
+ C4 = XMVectorMultiply(V0[2], V1[2]);
+ C6 = XMVectorMultiply(V0[3], V1[3]);
+
+ V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleZWYZ.v);
+ V1[0] = XMVectorPermute(D0, D2, Permute0W0X0Y1X.v);
+ V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleWZWY.v);
+ V1[1] = XMVectorPermute(D0, D2, Permute0Z0Y1X0X.v);
+ V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleZWYZ.v);
+ V1[2] = XMVectorPermute(D1, D2, Permute0W0X0Y1Z.v);
+ V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleWZWY.v);
+ V1[3] = XMVectorPermute(D1, D2, Permute0Z0Y1Z0X.v);
+
+ C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
+ C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
+ C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
+ C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
+
+ V0[0] = XMVectorPermute(MT.r[1], MT.r[1], SwizzleWXWX.v);
+ V1[0] = XMVectorPermute(D0, D2, Permute0Z1Y1X0Z.v);
+ V0[1] = XMVectorPermute(MT.r[0], MT.r[0], SwizzleYWXZ.v);
+ V1[1] = XMVectorPermute(D0, D2, Permute1Y0X0W1X.v);
+ V0[2] = XMVectorPermute(MT.r[3], MT.r[3], SwizzleWXWX.v);
+ V1[2] = XMVectorPermute(D1, D2, Permute0Z1W1Z0Z.v);
+ V0[3] = XMVectorPermute(MT.r[2], MT.r[2], SwizzleYWXZ.v);
+ V1[3] = XMVectorPermute(D1, D2, Permute1W0X0W1Z.v);
+
+ C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
+ C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0);
+ C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2);
+ C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
+ C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
+ C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4);
+ C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6);
+ C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
+
+ R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v);
+ R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v);
+ R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v);
+ R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v);
+
+ Determinant = XMVector4Dot(R.r[0], MT.r[0]);
+
+ if (pDeterminant)
+ *pDeterminant = Determinant;
+
+ Reciprocal = XMVectorReciprocal(Determinant);
+
+ Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal);
+ Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal);
+ Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal);
+ Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX MT = XMMatrixTranspose(M);
+ XMVECTOR V00 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,1,0,0));
+ XMVECTOR V10 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(3,2,3,2));
+ XMVECTOR V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,1,0,0));
+ XMVECTOR V11 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(3,2,3,2));
+ XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0));
+ XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1));
+
+ XMVECTOR D0 = _mm_mul_ps(V00,V10);
+ XMVECTOR D1 = _mm_mul_ps(V01,V11);
+ XMVECTOR D2 = _mm_mul_ps(V02,V12);
+
+ V00 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(3,2,3,2));
+ V10 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(1,1,0,0));
+ V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(3,2,3,2));
+ V11 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(1,1,0,0));
+ V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1));
+ V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0));
+
+ V00 = _mm_mul_ps(V00,V10);
+ V01 = _mm_mul_ps(V01,V11);
+ V02 = _mm_mul_ps(V02,V12);
+ D0 = _mm_sub_ps(D0,V00);
+ D1 = _mm_sub_ps(D1,V01);
+ D2 = _mm_sub_ps(D2,V02);
+ // V11 = D0Y,D0W,D2Y,D2Y
+ V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1));
+ V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(1,0,2,1));
+ V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2));
+ V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(0,1,0,2));
+ V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1));
+ // V13 = D1Y,D1W,D2W,D2W
+ XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1));
+ V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(1,0,2,1));
+ V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2));
+ XMVECTOR V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(0,1,0,2));
+ V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1));
+
+ XMVECTOR C0 = _mm_mul_ps(V00,V10);
+ XMVECTOR C2 = _mm_mul_ps(V01,V11);
+ XMVECTOR C4 = _mm_mul_ps(V02,V12);
+ XMVECTOR C6 = _mm_mul_ps(V03,V13);
+
+ // V11 = D0X,D0Y,D2X,D2X
+ V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0));
+ V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(2,1,3,2));
+ V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3));
+ V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,3,2,3));
+ V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2));
+ // V13 = D1X,D1Y,D2Z,D2Z
+ V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0));
+ V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(2,1,3,2));
+ V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3));
+ V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,3,2,3));
+ V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2));
+
+ V00 = _mm_mul_ps(V00,V10);
+ V01 = _mm_mul_ps(V01,V11);
+ V02 = _mm_mul_ps(V02,V12);
+ V03 = _mm_mul_ps(V03,V13);
+ C0 = _mm_sub_ps(C0,V00);
+ C2 = _mm_sub_ps(C2,V01);
+ C4 = _mm_sub_ps(C4,V02);
+ C6 = _mm_sub_ps(C6,V03);
+
+ V00 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(0,3,0,3));
+ // V10 = D0Z,D0Z,D2X,D2Y
+ V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2));
+ V10 = _mm_shuffle_ps(V10,V10,_MM_SHUFFLE(0,2,3,0));
+ V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(2,0,3,1));
+ // V11 = D0X,D0W,D2X,D2Y
+ V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0));
+ V11 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,1,0,3));
+ V02 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(0,3,0,3));
+ // V12 = D1Z,D1Z,D2Z,D2W
+ V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2));
+ V12 = _mm_shuffle_ps(V12,V12,_MM_SHUFFLE(0,2,3,0));
+ V03 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(2,0,3,1));
+ // V13 = D1X,D1W,D2Z,D2W
+ V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0));
+ V13 = _mm_shuffle_ps(V13,V13,_MM_SHUFFLE(2,1,0,3));
+
+ V00 = _mm_mul_ps(V00,V10);
+ V01 = _mm_mul_ps(V01,V11);
+ V02 = _mm_mul_ps(V02,V12);
+ V03 = _mm_mul_ps(V03,V13);
+ XMVECTOR C1 = _mm_sub_ps(C0,V00);
+ C0 = _mm_add_ps(C0,V00);
+ XMVECTOR C3 = _mm_add_ps(C2,V01);
+ C2 = _mm_sub_ps(C2,V01);
+ XMVECTOR C5 = _mm_sub_ps(C4,V02);
+ C4 = _mm_add_ps(C4,V02);
+ XMVECTOR C7 = _mm_add_ps(C6,V03);
+ C6 = _mm_sub_ps(C6,V03);
+
+ C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0));
+ C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0));
+ C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0));
+ C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0));
+ C0 = _mm_shuffle_ps(C0,C0,_MM_SHUFFLE(3,1,2,0));
+ C2 = _mm_shuffle_ps(C2,C2,_MM_SHUFFLE(3,1,2,0));
+ C4 = _mm_shuffle_ps(C4,C4,_MM_SHUFFLE(3,1,2,0));
+ C6 = _mm_shuffle_ps(C6,C6,_MM_SHUFFLE(3,1,2,0));
+ // Get the determinant
+ XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]);
+ if (pDeterminant)
+ *pDeterminant = vTemp;
+ vTemp = _mm_div_ps(g_XMOne,vTemp);
+ XMMATRIX mResult;
+ mResult.r[0] = _mm_mul_ps(C0,vTemp);
+ mResult.r[1] = _mm_mul_ps(C2,vTemp);
+ mResult.r[2] = _mm_mul_ps(C4,vTemp);
+ mResult.r[3] = _mm_mul_ps(C6,vTemp);
+ return mResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
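+
+// Usage sketch (illustrative, not part of the original sources): callers that
+// may receive singular matrices can check the determinant returned through the
+// first parameter before trusting the inverse.  Names below are hypothetical.
+XMFINLINE BOOL XMExampleTryInverse(XMMATRIX* pOut, CXMMATRIX M)
+{
+ XMVECTOR vDet;
+ XMMATRIX mInv = XMMatrixInverse(&vDet, M);
+ if (fabsf(XMVectorGetX(vDet)) < 1.0e-6f)
+ return FALSE; // (near-)singular: the reciprocal above is not meaningful
+ *pOut = mInv;
+ return TRUE;
+}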
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMMatrixDeterminant
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V0, V1, V2, V3, V4, V5;
+ XMVECTOR P0, P1, P2, R, S;
+ XMVECTOR Result;
+ static CONST XMVECTORU32 SwizzleYXXX = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 SwizzleZZYY = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 SwizzleWWWZ = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0Z};
+ static CONST XMVECTOR Sign = {1.0f, -1.0f, 1.0f, -1.0f};
+
+ V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX.v);
+ V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY.v);
+ V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX.v);
+ V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ.v);
+ V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY.v);
+ V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ.v);
+
+ P0 = XMVectorMultiply(V0, V1);
+ P1 = XMVectorMultiply(V2, V3);
+ P2 = XMVectorMultiply(V4, V5);
+
+ V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY.v);
+ V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX.v);
+ V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ.v);
+ V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX.v);
+ V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ.v);
+ V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY.v);
+
+ P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0);
+ P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1);
+ P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2);
+
+ V0 = XMVectorPermute(M.r[1], M.r[1], SwizzleWWWZ.v);
+ V1 = XMVectorPermute(M.r[1], M.r[1], SwizzleZZYY.v);
+ V2 = XMVectorPermute(M.r[1], M.r[1], SwizzleYXXX.v);
+
+ S = XMVectorMultiply(M.r[0], Sign);
+ R = XMVectorMultiply(V0, P0);
+ R = XMVectorNegativeMultiplySubtract(V1, P1, R);
+ R = XMVectorMultiplyAdd(V2, P2, R);
+
+ Result = XMVector4Dot(S, R);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V0, V1, V2, V3, V4, V5;
+ XMVECTOR P0, P1, P2, R, S;
+ XMVECTOR Result;
+ static CONST XMVECTORU32 SwizzleYXXX = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 SwizzleZZYY = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 SwizzleWWWZ = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0Z};
+ static CONST XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f};
+
+ V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX);
+ V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY);
+ V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleYXXX);
+ V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ);
+ V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY);
+ V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleWWWZ);
+
+ P0 = _mm_mul_ps(V0, V1);
+ P1 = _mm_mul_ps(V2, V3);
+ P2 = _mm_mul_ps(V4, V5);
+
+ V0 = XMVectorPermute(M.r[2], M.r[2], SwizzleZZYY);
+ V1 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX);
+ V2 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ);
+ V3 = XMVectorPermute(M.r[3], M.r[3], SwizzleYXXX);
+ V4 = XMVectorPermute(M.r[2], M.r[2], SwizzleWWWZ);
+ V5 = XMVectorPermute(M.r[3], M.r[3], SwizzleZZYY);
+
+ P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0);
+ P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1);
+ P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2);
+
+ V0 = XMVectorPermute(M.r[1], M.r[1], SwizzleWWWZ);
+ V1 = XMVectorPermute(M.r[1], M.r[1], SwizzleZZYY);
+ V2 = XMVectorPermute(M.r[1], M.r[1], SwizzleYXXX);
+
+ S = _mm_mul_ps(M.r[0], Sign);
+ R = _mm_mul_ps(V0, P0);
+ R = XMVectorNegativeMultiplySubtract(V1, P1, R);
+ R = XMVectorMultiplyAdd(V2, P2, R);
+
+ Result = XMVector4Dot(S, R);
+
+ return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#undef XMRANKDECOMPOSE
+#undef XM_DECOMP_EPSILON
+
+#define XMRANKDECOMPOSE(a, b, c, x, y, z) \
+ if((x) < (y)) \
+ { \
+ if((y) < (z)) \
+ { \
+ (a) = 2; \
+ (b) = 1; \
+ (c) = 0; \
+ } \
+ else \
+ { \
+ (a) = 1; \
+ \
+ if((x) < (z)) \
+ { \
+ (b) = 2; \
+ (c) = 0; \
+ } \
+ else \
+ { \
+ (b) = 0; \
+ (c) = 2; \
+ } \
+ } \
+ } \
+ else \
+ { \
+ if((x) < (z)) \
+ { \
+ (a) = 2; \
+ (b) = 0; \
+ (c) = 1; \
+ } \
+ else \
+ { \
+ (a) = 0; \
+ \
+ if((y) < (z)) \
+ { \
+ (b) = 2; \
+ (c) = 1; \
+ } \
+ else \
+ { \
+ (b) = 1; \
+ (c) = 2; \
+ } \
+ } \
+ }
+
+#define XM_DECOMP_EPSILON 0.0001f
+
+XMINLINE BOOL XMMatrixDecompose
+(
+ XMVECTOR *outScale,
+ XMVECTOR *outRotQuat,
+ XMVECTOR *outTrans,
+ CXMMATRIX M
+)
+{
+ FLOAT fDet;
+ FLOAT *pfScales;
+ XMVECTOR *ppvBasis[3];
+ XMMATRIX matTemp;
+ UINT a, b, c;
+ static const XMVECTOR *pvCanonicalBasis[3] = {
+ &g_XMIdentityR0.v,
+ &g_XMIdentityR1.v,
+ &g_XMIdentityR2.v
+ };
+
+ XMASSERT( outScale != NULL );
+ XMASSERT( outRotQuat != NULL );
+ XMASSERT( outTrans != NULL );
+
+ // Get the translation
+ outTrans[0] = M.r[3];
+
+ ppvBasis[0] = &matTemp.r[0];
+ ppvBasis[1] = &matTemp.r[1];
+ ppvBasis[2] = &matTemp.r[2];
+
+ matTemp.r[0] = M.r[0];
+ matTemp.r[1] = M.r[1];
+ matTemp.r[2] = M.r[2];
+ matTemp.r[3] = g_XMIdentityR3.v;
+
+ pfScales = (FLOAT *)outScale;
+
+ XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0]));
+ XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0]));
+ XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0]));
+ pfScales[3] = 0.f;
+
+ XMRANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2])
+
+ if(pfScales[a] < XM_DECOMP_EPSILON)
+ {
+ ppvBasis[a][0] = pvCanonicalBasis[a][0];
+ }
+ ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);
+
+ if(pfScales[b] < XM_DECOMP_EPSILON)
+ {
+ UINT aa, bb, cc;
+ FLOAT fAbsX, fAbsY, fAbsZ;
+
+ fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
+ fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
+ fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));
+
+ XMRANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ)
+
+ ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]);
+ }
+
+ ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);
+
+ if(pfScales[c] < XM_DECOMP_EPSILON)
+ {
+ ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]);
+ }
+
+ ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);
+
+ fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));
+
+ // use Cramer's rule to check for handedness of coordinate system
+ if(fDet < 0.0f)
+ {
+ // switch coordinate system by negating the scale and inverting the basis vector on the x-axis
+ pfScales[a] = -pfScales[a];
+ ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);
+
+ fDet = -fDet;
+ }
+
+ fDet -= 1.0f;
+ fDet *= fDet;
+
+ if(XM_DECOMP_EPSILON < fDet)
+ {
+ // Non-SRT matrix encountered
+ return FALSE;
+ }
+
+ // generate the quaternion from the matrix
+ outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
+ return TRUE;
+}
+
+#undef XMRANKDECOMPOSE
+#undef XM_DECOMP_EPSILON
+
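+// A minimal usage sketch for XMMatrixDecompose (illustrative only, not part of
+// the original XNAMath sources). It assumes the input is a well-formed
+// scale/rotation/translation matrix; non-SRT matrices make the function
+// return FALSE.
+//
+//     XMVECTOR vScale, vRotQuat, vTrans;
+//     XMMATRIX world = XMMatrixAffineTransformation(
+//         XMVectorSet(2.0f, 2.0f, 2.0f, 0.0f),                     // Scaling
+//         XMVectorZero(),                                          // RotationOrigin
+//         XMQuaternionRotationRollPitchYaw(0.0f, XM_PIDIV2, 0.0f), // RotationQuaternion
+//         XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f));                    // Translation
+//     if (XMMatrixDecompose(&vScale, &vRotQuat, &vTrans, world))
+//     {
+//         // vScale is ~(2,2,2), vRotQuat a 90-degree yaw, vTrans ~(1,2,3).
+//     }
+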
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixIdentity()
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ M.r[0] = g_XMIdentityR0.v;
+ M.r[1] = g_XMIdentityR1.v;
+ M.r[2] = g_XMIdentityR2.v;
+ M.r[3] = g_XMIdentityR3.v;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ M.r[0] = g_XMIdentityR0;
+ M.r[1] = g_XMIdentityR1;
+ M.r[2] = g_XMIdentityR2;
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixSet
+(
+ FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33
+)
+{
+ XMMATRIX M;
+
+ M.r[0] = XMVectorSet(m00, m01, m02, m03);
+ M.r[1] = XMVectorSet(m10, m11, m12, m13);
+ M.r[2] = XMVectorSet(m20, m21, m22, m23);
+ M.r[3] = XMVectorSet(m30, m31, m32, m33);
+
+ return M;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixTranslation
+(
+ FLOAT OffsetX,
+ FLOAT OffsetY,
+ FLOAT OffsetZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ M.m[0][0] = 1.0f;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = 1.0f;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = 1.0f;
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = OffsetX;
+ M.m[3][1] = OffsetY;
+ M.m[3][2] = OffsetZ;
+ M.m[3][3] = 1.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ M.r[0] = g_XMIdentityR0;
+ M.r[1] = g_XMIdentityR1;
+ M.r[2] = g_XMIdentityR2;
+ M.r[3] = _mm_set_ps(1.0f,OffsetZ,OffsetY,OffsetX);
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixTranslationFromVector
+(
+ FXMVECTOR Offset
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ M.m[0][0] = 1.0f;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = 1.0f;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = 1.0f;
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = Offset.vector4_f32[0];
+ M.m[3][1] = Offset.vector4_f32[1];
+ M.m[3][2] = Offset.vector4_f32[2];
+ M.m[3][3] = 1.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_and_ps(Offset,g_XMMask3);
+ vTemp = _mm_or_ps(vTemp,g_XMIdentityR3);
+ XMMATRIX M;
+ M.r[0] = g_XMIdentityR0;
+ M.r[1] = g_XMIdentityR1;
+ M.r[2] = g_XMIdentityR2;
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixScaling
+(
+ FLOAT ScaleX,
+ FLOAT ScaleY,
+ FLOAT ScaleZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ M.r[0] = XMVectorSet(ScaleX, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, ScaleY, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, ScaleZ, 0.0f);
+
+ M.r[3] = g_XMIdentityR3.v;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX );
+ M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 );
+ M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 );
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixScalingFromVector
+(
+ FXMVECTOR Scale
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+ M.m[0][0] = Scale.vector4_f32[0];
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = Scale.vector4_f32[1];
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = Scale.vector4_f32[2];
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = 0.0f;
+ M.m[3][3] = 1.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ M.r[0] = _mm_and_ps(Scale,g_XMMaskX);
+ M.r[1] = _mm_and_ps(Scale,g_XMMaskY);
+ M.r[2] = _mm_and_ps(Scale,g_XMMaskZ);
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationX
+(
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+
+ FLOAT fSinAngle = sinf(Angle);
+ FLOAT fCosAngle = cosf(Angle);
+
+ M.m[0][0] = 1.0f;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = fCosAngle;
+ M.m[1][2] = fSinAngle;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = -fSinAngle;
+ M.m[2][2] = fCosAngle;
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = 0.0f;
+ M.m[3][3] = 1.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ FLOAT SinAngle = sinf(Angle);
+ FLOAT CosAngle = cosf(Angle);
+
+ XMVECTOR vSin = _mm_set_ss(SinAngle);
+ XMVECTOR vCos = _mm_set_ss(CosAngle);
+ // x = 0,y = cos,z = sin, w = 0
+ vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3));
+ XMMATRIX M;
+ M.r[0] = g_XMIdentityR0;
+ M.r[1] = vCos;
+ // x = 0,y = sin,z = cos, w = 0
+ vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,1,2,0));
+ // x = 0,y = -sin,z = cos, w = 0
+ vCos = _mm_mul_ps(vCos,g_XMNegateY);
+ M.r[2] = vCos;
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationY
+(
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+
+ FLOAT fSinAngle = sinf(Angle);
+ FLOAT fCosAngle = cosf(Angle);
+
+ M.m[0][0] = fCosAngle;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = -fSinAngle;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = 1.0f;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = fSinAngle;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = fCosAngle;
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = 0.0f;
+ M.m[3][3] = 1.0f;
+ return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+ FLOAT SinAngle = sinf(Angle);
+ FLOAT CosAngle = cosf(Angle);
+
+ XMVECTOR vSin = _mm_set_ss(SinAngle);
+ XMVECTOR vCos = _mm_set_ss(CosAngle);
+ // x = sin,y = 0,z = cos, w = 0
+ vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0));
+ XMMATRIX M;
+ M.r[2] = vSin;
+ M.r[1] = g_XMIdentityR1;
+ // x = cos,y = 0,z = sin, w = 0
+ vSin = _mm_shuffle_ps(vSin,vSin,_MM_SHUFFLE(3,0,1,2));
+ // x = cos,y = 0,z = -sin, w = 0
+ vSin = _mm_mul_ps(vSin,g_XMNegateZ);
+ M.r[0] = vSin;
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationZ
+(
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMMATRIX M;
+
+ FLOAT fSinAngle = sinf(Angle);
+ FLOAT fCosAngle = cosf(Angle);
+
+ M.m[0][0] = fCosAngle;
+ M.m[0][1] = fSinAngle;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = -fSinAngle;
+ M.m[1][1] = fCosAngle;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = 1.0f;
+ M.m[2][3] = 0.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = 0.0f;
+ M.m[3][3] = 1.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ FLOAT SinAngle = sinf(Angle);
+ FLOAT CosAngle = cosf(Angle);
+
+ XMVECTOR vSin = _mm_set_ss(SinAngle);
+ XMVECTOR vCos = _mm_set_ss(CosAngle);
+ // x = cos,y = sin,z = 0, w = 0
+ vCos = _mm_unpacklo_ps(vCos,vSin);
+ XMMATRIX M;
+ M.r[0] = vCos;
+ // x = sin,y = cos,z = 0, w = 0
+ vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,2,0,1));
+ // x = -sin,y = cos,z = 0, w = 0
+ vCos = _mm_mul_ps(vCos,g_XMNegateX);
+ M.r[1] = vCos;
+ M.r[2] = g_XMIdentityR2;
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
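+// A short sketch (illustrative, not from the original sources): these matrices
+// are laid out for row vectors, i.e. a point transforms as v' = v * M, which is
+// why the translation lives in the fourth row. Rotating +X by pi/2 about Z:
+//
+//     XMMATRIX rotZ = XMMatrixRotationZ(XM_PIDIV2);
+//     XMVECTOR p    = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
+//     XMVECTOR q    = XMVector3Transform(p, rotZ);   // q is approximately (0, 1, 0)
+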
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationRollPitchYaw
+(
+ FLOAT Pitch,
+ FLOAT Yaw,
+ FLOAT Roll
+)
+{
+ XMVECTOR Angles;
+ XMMATRIX M;
+
+ Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+ M = XMMatrixRotationRollPitchYawFromVector(Angles);
+
+ return M;
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationRollPitchYawFromVector
+(
+ FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
+)
+{
+ XMVECTOR Q;
+ XMMATRIX M;
+
+ Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+ M = XMMatrixRotationQuaternion(Q);
+
+ return M;
+}
+
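+// Note the argument order (illustrative example, not part of the original
+// sources): despite the "RollPitchYaw" name, the scalar overload takes pitch
+// first, and all angles are in radians.
+//
+//     // 30 degrees of yaw about +Y, no pitch or roll:
+//     XMMATRIX yaw30 = XMMatrixRotationRollPitchYaw(0.0f, XMConvertToRadians(30.0f), 0.0f);
+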
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationNormal
+(
+ FXMVECTOR NormalAxis,
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR A;
+ XMVECTOR N0, N1;
+ XMVECTOR V0, V1, V2;
+ XMVECTOR R0, R1, R2;
+ XMVECTOR C0, C1, C2;
+ XMMATRIX M;
+ static CONST XMVECTORU32 SwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 SwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute0Z1Y1Z0X = {XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 Permute0Y1X0Y1X = {XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X};
+ static CONST XMVECTORU32 Permute0X1X1Y0W = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute1Z0Y1W0W = {XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute1X1Y0Z0W = {XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+
+ FLOAT fSinAngle = sinf(Angle);
+ FLOAT fCosAngle = cosf(Angle);
+
+ A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);
+
+ C2 = XMVectorSplatZ(A);
+ C1 = XMVectorSplatY(A);
+ C0 = XMVectorSplatX(A);
+
+ N0 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleYZXW.v);
+ N1 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleZXYW.v);
+
+ V0 = XMVectorMultiply(C2, N0);
+ V0 = XMVectorMultiply(V0, N1);
+
+ R0 = XMVectorMultiply(C2, NormalAxis);
+ R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);
+
+ R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
+ R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);
+
+ V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
+ V1 = XMVectorPermute(R1, R2, Permute0Z1Y1Z0X.v);
+ V2 = XMVectorPermute(R1, R2, Permute0Y1X0Y1X.v);
+
+ M.r[0] = XMVectorPermute(V0, V1, Permute0X1X1Y0W.v);
+ M.r[1] = XMVectorPermute(V0, V1, Permute1Z0Y1W0W.v);
+ M.r[2] = XMVectorPermute(V0, V2, Permute1X1Y0Z0W.v);
+ M.r[3] = g_XMIdentityR3.v;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR N0, N1;
+ XMVECTOR V0, V1, V2;
+ XMVECTOR R0, R1, R2;
+ XMVECTOR C0, C1, C2;
+ XMMATRIX M;
+
+ FLOAT fSinAngle = sinf(Angle);
+ FLOAT fCosAngle = cosf(Angle);
+
+ C2 = _mm_set_ps1(1.0f - fCosAngle);
+ C1 = _mm_set_ps1(fCosAngle);
+ C0 = _mm_set_ps1(fSinAngle);
+
+ N0 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,0,2,1));
+// N0 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleYZXW);
+ N1 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,1,0,2));
+// N1 = XMVectorPermute(NormalAxis, NormalAxis, SwizzleZXYW);
+
+ V0 = _mm_mul_ps(C2, N0);
+ V0 = _mm_mul_ps(V0, N1);
+
+ R0 = _mm_mul_ps(C2, NormalAxis);
+ R0 = _mm_mul_ps(R0, NormalAxis);
+ R0 = _mm_add_ps(R0, C1);
+
+ R1 = _mm_mul_ps(C0, NormalAxis);
+ R1 = _mm_add_ps(R1, V0);
+ R2 = _mm_mul_ps(C0, NormalAxis);
+ R2 = _mm_sub_ps(V0,R2);
+
+ V0 = _mm_and_ps(R0,g_XMMask3);
+// V0 = XMVectorSelect(A, R0, g_XMSelect1110);
+ V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
+ V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,3,2,1));
+// V1 = XMVectorPermute(R1, R2, Permute0Z1Y1Z0X);
+ V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
+ V2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,2,0));
+// V2 = XMVectorPermute(R1, R2, Permute0Y1X0Y1X);
+
+ R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
+ R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,2,0));
+ M.r[0] = R2;
+// M.r[0] = XMVectorPermute(V0, V1, Permute0X1X1Y0W);
+ R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
+ R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,0,2));
+ M.r[1] = R2;
+// M.r[1] = XMVectorPermute(V0, V1, Permute1Z0Y1W0W);
+ V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
+// R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(3,2,1,0));
+ M.r[2] = V2;
+// M.r[2] = XMVectorPermute(V0, V2, Permute1X1Y0Z0W);
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixRotationAxis
+(
+ FXMVECTOR Axis,
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Normal;
+ XMMATRIX M;
+
+ XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(Axis));
+
+ Normal = XMVector3Normalize(Axis);
+ M = XMMatrixRotationNormal(Normal, Angle);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(Axis));
+ XMVECTOR Normal = XMVector3Normalize(Axis);
+ XMMATRIX M = XMMatrixRotationNormal(Normal, Angle);
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
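+// A small sketch (illustrative, not part of the original sources):
+// XMMatrixRotationNormal assumes the axis is already unit length, while
+// XMMatrixRotationAxis normalizes it first, so these produce the same matrix:
+//
+//     XMVECTOR axis = XMVectorSet(0.0f, 3.0f, 0.0f, 0.0f);          // not unit length
+//     XMMATRIX a = XMMatrixRotationAxis(axis, XM_PIDIV4);
+//     XMMATRIX b = XMMatrixRotationNormal(XMVector3Normalize(axis), XM_PIDIV4);
+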
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixRotationQuaternion
+(
+ FXMVECTOR Quaternion
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ XMVECTOR Q0, Q1;
+ XMVECTOR V0, V1, V2;
+ XMVECTOR R0, R1, R2;
+ static CONST XMVECTOR Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+ static CONST XMVECTORU32 SwizzleXXYW = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 SwizzleZYZW = {XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 SwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute0Y0X0X1W = {XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W};
+ static CONST XMVECTORU32 Permute0Z0Z0Y1W = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W};
+ static CONST XMVECTORU32 Permute0Y1X1Y0Z = {XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 Permute0X1Z0X1Z = {XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z};
+ static CONST XMVECTORU32 Permute0X1X1Y0W = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute1Z0Y1W0W = {XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute1X1Y0Z0W = {XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+
+ Q0 = XMVectorAdd(Quaternion, Quaternion);
+ Q1 = XMVectorMultiply(Quaternion, Q0);
+
+ V0 = XMVectorPermute(Q1, Constant1110, Permute0Y0X0X1W.v);
+ V1 = XMVectorPermute(Q1, Constant1110, Permute0Z0Z0Y1W.v);
+ R0 = XMVectorSubtract(Constant1110, V0);
+ R0 = XMVectorSubtract(R0, V1);
+
+ V0 = XMVectorPermute(Quaternion, Quaternion, SwizzleXXYW.v);
+ V1 = XMVectorPermute(Q0, Q0, SwizzleZYZW.v);
+ V0 = XMVectorMultiply(V0, V1);
+
+ V1 = XMVectorSplatW(Quaternion);
+ V2 = XMVectorPermute(Q0, Q0, SwizzleYZXW.v);
+ V1 = XMVectorMultiply(V1, V2);
+
+ R1 = XMVectorAdd(V0, V1);
+ R2 = XMVectorSubtract(V0, V1);
+
+ V0 = XMVectorPermute(R1, R2, Permute0Y1X1Y0Z.v);
+ V1 = XMVectorPermute(R1, R2, Permute0X1Z0X1Z.v);
+
+ M.r[0] = XMVectorPermute(R0, V0, Permute0X1X1Y0W.v);
+ M.r[1] = XMVectorPermute(R0, V0, Permute1Z0Y1W0W.v);
+ M.r[2] = XMVectorPermute(R0, V1, Permute1X1Y0Z0W.v);
+ M.r[3] = g_XMIdentityR3.v;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMVECTOR Q0, Q1;
+ XMVECTOR V0, V1, V2;
+ XMVECTOR R0, R1, R2;
+ static CONST XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+
+ Q0 = _mm_add_ps(Quaternion,Quaternion);
+ Q1 = _mm_mul_ps(Quaternion,Q0);
+
+ V0 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,0,0,1));
+ V0 = _mm_and_ps(V0,g_XMMask3);
+// V0 = XMVectorPermute(Q1, Constant1110,Permute0Y0X0X1W);
+ V1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,1,2,2));
+ V1 = _mm_and_ps(V1,g_XMMask3);
+// V1 = XMVectorPermute(Q1, Constant1110,Permute0Z0Z0Y1W);
+ R0 = _mm_sub_ps(Constant1110,V0);
+ R0 = _mm_sub_ps(R0, V1);
+
+ V0 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,1,0,0));
+// V0 = XMVectorPermute(Quaternion, Quaternion,SwizzleXXYW);
+ V1 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,2,1,2));
+// V1 = XMVectorPermute(Q0, Q0,SwizzleZYZW);
+ V0 = _mm_mul_ps(V0, V1);
+
+ V1 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,3,3,3));
+// V1 = XMVectorSplatW(Quaternion);
+ V2 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,0,2,1));
+// V2 = XMVectorPermute(Q0, Q0,SwizzleYZXW);
+ V1 = _mm_mul_ps(V1, V2);
+
+ R1 = _mm_add_ps(V0, V1);
+ R2 = _mm_sub_ps(V0, V1);
+
+ V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
+ V0 = _mm_shuffle_ps(V0,V0,_MM_SHUFFLE(1,3,2,0));
+// V0 = XMVectorPermute(R1, R2,Permute0Y1X1Y0Z);
+ V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
+ V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,0,2,0));
+// V1 = XMVectorPermute(R1, R2,Permute0X1Z0X1Z);
+
+ Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
+ Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,2,0));
+ M.r[0] = Q1;
+// M.r[0] = XMVectorPermute(R0, V0,Permute0X1X1Y0W);
+ Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
+ Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,0,2));
+ M.r[1] = Q1;
+// M.r[1] = XMVectorPermute(R0, V0,Permute1Z0Y1W0W);
+ Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
+ M.r[2] = Q1;
+// M.r[2] = XMVectorPermute(R0, V1,Permute1X1Y0Z0W);
+ M.r[3] = g_XMIdentityR3;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixTransformation2D
+(
+ FXMVECTOR ScalingOrigin,
+ FLOAT ScalingOrientation,
+ FXMVECTOR Scaling,
+ FXMVECTOR RotationOrigin,
+ FLOAT Rotation,
+ CXMVECTOR Translation
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ XMVECTOR VScaling;
+ XMVECTOR NegScalingOrigin;
+ XMVECTOR VScalingOrigin;
+ XMMATRIX MScalingOriginI;
+ XMMATRIX MScalingOrientation;
+ XMMATRIX MScalingOrientationT;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+ // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v);
+ NegScalingOrigin = XMVectorNegate(VScalingOrigin);
+
+ MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
+ MScalingOrientation = XMMatrixRotationZ(ScalingOrientation);
+ MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
+ VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
+ MScaling = XMMatrixScalingFromVector(VScaling);
+ VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
+ MRotation = XMMatrixRotationZ(Rotation);
+ VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v);
+
+ M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
+ M = XMMatrixMultiply(M, MScaling);
+ M = XMMatrixMultiply(M, MScalingOrientation);
+ M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMVECTOR VScaling;
+ XMVECTOR NegScalingOrigin;
+ XMVECTOR VScalingOrigin;
+ XMMATRIX MScalingOriginI;
+ XMMATRIX MScalingOrientation;
+ XMMATRIX MScalingOrientationT;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+ // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+ static const XMVECTORU32 Mask2 = {0xFFFFFFFF,0xFFFFFFFF,0,0};
+ static const XMVECTORF32 ZWOne = {0,0,1.0f,1.0f};
+
+ VScalingOrigin = _mm_and_ps(ScalingOrigin, Mask2);
+ NegScalingOrigin = XMVectorNegate(VScalingOrigin);
+
+ MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
+ MScalingOrientation = XMMatrixRotationZ(ScalingOrientation);
+ MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
+ VScaling = _mm_and_ps(Scaling, Mask2);
+ VScaling = _mm_or_ps(VScaling,ZWOne);
+ MScaling = XMMatrixScalingFromVector(VScaling);
+ VRotationOrigin = _mm_and_ps(RotationOrigin, Mask2);
+ MRotation = XMMatrixRotationZ(Rotation);
+ VTranslation = _mm_and_ps(Translation, Mask2);
+
+ M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
+ M = XMMatrixMultiply(M, MScaling);
+ M = XMMatrixMultiply(M, MScalingOrientation);
+ M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixTransformation
+(
+ FXMVECTOR ScalingOrigin,
+ FXMVECTOR ScalingOrientationQuaternion,
+ FXMVECTOR Scaling,
+ CXMVECTOR RotationOrigin,
+ CXMVECTOR RotationQuaternion,
+ CXMVECTOR Translation
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ XMVECTOR NegScalingOrigin;
+ XMVECTOR VScalingOrigin;
+ XMMATRIX MScalingOriginI;
+ XMMATRIX MScalingOrientation;
+ XMMATRIX MScalingOrientationT;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+ // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v);
+ NegScalingOrigin = XMVectorNegate(ScalingOrigin);
+
+ MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
+ MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion);
+ MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
+ MScaling = XMMatrixScalingFromVector(Scaling);
+ VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
+ MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
+ VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);
+
+ M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
+ M = XMMatrixMultiply(M, MScaling);
+ M = XMMatrixMultiply(M, MScalingOrientation);
+ M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMVECTOR NegScalingOrigin;
+ XMVECTOR VScalingOrigin;
+ XMMATRIX MScalingOriginI;
+ XMMATRIX MScalingOrientation;
+ XMMATRIX MScalingOrientationT;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+ // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ VScalingOrigin = _mm_and_ps(ScalingOrigin,g_XMMask3);
+ NegScalingOrigin = XMVectorNegate(ScalingOrigin);
+
+ MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
+ MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion);
+ MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
+ MScaling = XMMatrixScalingFromVector(Scaling);
+ VRotationOrigin = _mm_and_ps(RotationOrigin,g_XMMask3);
+ MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
+ VTranslation = _mm_and_ps(Translation,g_XMMask3);
+
+ M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
+ M = XMMatrixMultiply(M, MScaling);
+ M = XMMatrixMultiply(M, MScalingOrientation);
+ M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixAffineTransformation2D
+(
+ FXMVECTOR Scaling,
+ FXMVECTOR RotationOrigin,
+ FLOAT Rotation,
+ FXMVECTOR Translation
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ XMVECTOR VScaling;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
+ MScaling = XMMatrixScalingFromVector(VScaling);
+ VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
+ MRotation = XMMatrixRotationZ(Rotation);
+ VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v);
+
+ M = MScaling;
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMVECTOR VScaling;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+ static const XMVECTORU32 Mask2 = {0xFFFFFFFFU,0xFFFFFFFFU,0,0};
+ static const XMVECTORF32 ZW1 = {0,0,1.0f,1.0f};
+
+ // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ VScaling = _mm_and_ps(Scaling, Mask2);
+ VScaling = _mm_or_ps(VScaling, ZW1);
+ MScaling = XMMatrixScalingFromVector(VScaling);
+ VRotationOrigin = _mm_and_ps(RotationOrigin, Mask2);
+ MRotation = XMMatrixRotationZ(Rotation);
+ VTranslation = _mm_and_ps(Translation, Mask2);
+
+ M = MScaling;
+ M.r[3] = _mm_sub_ps(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = _mm_add_ps(M.r[3], VRotationOrigin);
+ M.r[3] = _mm_add_ps(M.r[3], VTranslation);
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixAffineTransformation
+(
+ FXMVECTOR Scaling,
+ FXMVECTOR RotationOrigin,
+ FXMVECTOR RotationQuaternion,
+ CXMVECTOR Translation
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ MScaling = XMMatrixScalingFromVector(Scaling);
+ VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v);
+ MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
+ VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v);
+
+ M = MScaling;
+ M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
+ M.r[3] = XMVectorAdd(M.r[3], VTranslation);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMMATRIX MScaling;
+ XMVECTOR VRotationOrigin;
+ XMMATRIX MRotation;
+ XMVECTOR VTranslation;
+
+ // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+ MScaling = XMMatrixScalingFromVector(Scaling);
+ VRotationOrigin = _mm_and_ps(RotationOrigin,g_XMMask3);
+ MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
+ VTranslation = _mm_and_ps(Translation,g_XMMask3);
+
+ M = MScaling;
+ M.r[3] = _mm_sub_ps(M.r[3], VRotationOrigin);
+ M = XMMatrixMultiply(M, MRotation);
+ M.r[3] = _mm_add_ps(M.r[3], VRotationOrigin);
+ M.r[3] = _mm_add_ps(M.r[3], VTranslation);
+
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
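+// Typical use (illustrative sketch, not part of the original sources): build a
+// world matrix from scale, orientation and position in one call instead of
+// multiplying the individual matrices by hand.
+//
+//     XMMATRIX world = XMMatrixAffineTransformation(
+//         XMVectorSet(1.5f, 1.5f, 1.5f, 0.0f),                   // Scaling
+//         XMVectorZero(),                                        // RotationOrigin
+//         XMQuaternionRotationRollPitchYaw(0.0f, XM_PI, 0.0f),   // RotationQuaternion
+//         XMVectorSet(10.0f, 0.0f, -5.0f, 0.0f));                // Translation
+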
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixReflect
+(
+ FXMVECTOR ReflectionPlane
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR P;
+ XMVECTOR S;
+ XMVECTOR A, B, C, D;
+ XMMATRIX M;
+ static CONST XMVECTOR NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f};
+
+ XMASSERT(!XMVector3Equal(ReflectionPlane, XMVectorZero()));
+ XMASSERT(!XMPlaneIsInfinite(ReflectionPlane));
+
+ P = XMPlaneNormalize(ReflectionPlane);
+ S = XMVectorMultiply(P, NegativeTwo);
+
+ A = XMVectorSplatX(P);
+ B = XMVectorSplatY(P);
+ C = XMVectorSplatZ(P);
+ D = XMVectorSplatW(P);
+
+ M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v);
+ M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v);
+ M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v);
+ M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ static CONST XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f};
+
+ XMASSERT(!XMVector3Equal(ReflectionPlane, XMVectorZero()));
+ XMASSERT(!XMPlaneIsInfinite(ReflectionPlane));
+
+ XMVECTOR P = XMPlaneNormalize(ReflectionPlane);
+ XMVECTOR S = _mm_mul_ps(P,NegativeTwo);
+ XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0));
+ XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2));
+ P = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3));
+ X = _mm_mul_ps(X,S);
+ Y = _mm_mul_ps(Y,S);
+ Z = _mm_mul_ps(Z,S);
+ P = _mm_mul_ps(P,S);
+ X = _mm_add_ps(X,g_XMIdentityR0);
+ Y = _mm_add_ps(Y,g_XMIdentityR1);
+ Z = _mm_add_ps(Z,g_XMIdentityR2);
+ P = _mm_add_ps(P,g_XMIdentityR3);
+ M.r[0] = X;
+ M.r[1] = Y;
+ M.r[2] = Z;
+ M.r[3] = P;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
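+// A usage sketch (illustrative, not part of the original sources). The plane is
+// given as (a, b, c, d) with a*x + b*y + c*z + d = 0; e.g. mirroring geometry
+// across the ground plane y = 0, where worldMatrix is an assumed existing
+// object-to-world matrix:
+//
+//     XMVECTOR groundPlane   = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
+//     XMMATRIX mirror        = XMMatrixReflect(groundPlane);
+//     XMMATRIX mirroredWorld = XMMatrixMultiply(worldMatrix, mirror);
+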
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixShadow
+(
+ FXMVECTOR ShadowPlane,
+ FXMVECTOR LightPosition
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR P;
+ XMVECTOR Dot;
+ XMVECTOR A, B, C, D;
+ XMMATRIX M;
+ static CONST XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1};
+
+ XMASSERT(!XMVector3Equal(ShadowPlane, XMVectorZero()));
+ XMASSERT(!XMPlaneIsInfinite(ShadowPlane));
+
+ P = XMPlaneNormalize(ShadowPlane);
+ Dot = XMPlaneDot(P, LightPosition);
+ P = XMVectorNegate(P);
+ D = XMVectorSplatW(P);
+ C = XMVectorSplatZ(P);
+ B = XMVectorSplatY(P);
+ A = XMVectorSplatX(P);
+ Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v);
+ M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot);
+ Dot = XMVectorRotateLeft(Dot, 1);
+ M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot);
+ Dot = XMVectorRotateLeft(Dot, 1);
+ M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot);
+ Dot = XMVectorRotateLeft(Dot, 1);
+ M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot);
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ XMASSERT(!XMVector3Equal(ShadowPlane, XMVectorZero()));
+ XMASSERT(!XMPlaneIsInfinite(ShadowPlane));
+ XMVECTOR P = XMPlaneNormalize(ShadowPlane);
+ XMVECTOR Dot = XMPlaneDot(P,LightPosition);
+ // Negate
+ P = _mm_mul_ps(P,g_XMNegativeOne);
+ XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0));
+ XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2));
+ P = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3));
+ Dot = _mm_and_ps(Dot,g_XMMaskW);
+ X = _mm_mul_ps(X,LightPosition);
+ Y = _mm_mul_ps(Y,LightPosition);
+ Z = _mm_mul_ps(Z,LightPosition);
+ P = _mm_mul_ps(P,LightPosition);
+ P = _mm_add_ps(P,Dot);
+ Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1));
+ Z = _mm_add_ps(Z,Dot);
+ Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1));
+ Y = _mm_add_ps(Y,Dot);
+ Dot = _mm_shuffle_ps(Dot,Dot,_MM_SHUFFLE(0,3,2,1));
+ X = _mm_add_ps(X,Dot);
+ // Store the resulting matrix
+ M.r[0] = X;
+ M.r[1] = Y;
+ M.r[2] = Z;
+ M.r[3] = P;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
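+// A usage sketch (illustrative, not part of the original sources): flatten
+// geometry onto the plane y = 0 as seen from a light. Following the usual
+// D3DX convention, LightPosition.w = 1 describes a point light and w = 0 a
+// directional light; that convention is an assumption here, not taken from
+// this file.
+//
+//     XMVECTOR groundPlane = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
+//     XMVECTOR pointLight  = XMVectorSet(5.0f, 10.0f, 0.0f, 1.0f);
+//     XMMATRIX shadow      = XMMatrixShadow(groundPlane, pointLight);
+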
+//------------------------------------------------------------------------------
+// View and projection initialization operations
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixLookAtLH
+(
+ FXMVECTOR EyePosition,
+ FXMVECTOR FocusPosition,
+ FXMVECTOR UpDirection
+)
+{
+ XMVECTOR EyeDirection;
+ XMMATRIX M;
+
+ EyeDirection = XMVectorSubtract(FocusPosition, EyePosition);
+ M = XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection);
+
+ return M;
+}
+
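+// A usage sketch (illustrative, not part of the original sources): a
+// left-handed view matrix for a camera at (0, 2, -10) looking at the origin
+// with +Y as up.
+//
+//     XMVECTOR eye    = XMVectorSet(0.0f, 2.0f, -10.0f, 1.0f);
+//     XMVECTOR target = XMVectorSet(0.0f, 0.0f, 0.0f, 1.0f);
+//     XMVECTOR up     = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
+//     XMMATRIX view   = XMMatrixLookAtLH(eye, target, up);
+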
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixLookAtRH
+(
+ FXMVECTOR EyePosition,
+ FXMVECTOR FocusPosition,
+ FXMVECTOR UpDirection
+)
+{
+ XMVECTOR NegEyeDirection;
+ XMMATRIX M;
+
+ NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition);
+ M = XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
+
+ return M;
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMMATRIX XMMatrixLookToLH
+(
+ FXMVECTOR EyePosition,
+ FXMVECTOR EyeDirection,
+ FXMVECTOR UpDirection
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegEyePosition;
+ XMVECTOR D0, D1, D2;
+ XMVECTOR R0, R1, R2;
+ XMMATRIX M;
+
+ XMASSERT(!XMVector3Equal(EyeDirection, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(EyeDirection));
+ XMASSERT(!XMVector3Equal(UpDirection, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(UpDirection));
+
+ R2 = XMVector3Normalize(EyeDirection);
+
+ R0 = XMVector3Cross(UpDirection, R2);
+ R0 = XMVector3Normalize(R0);
+
+ R1 = XMVector3Cross(R2, R0);
+
+ NegEyePosition = XMVectorNegate(EyePosition);
+
+ D0 = XMVector3Dot(R0, NegEyePosition);
+ D1 = XMVector3Dot(R1, NegEyePosition);
+ D2 = XMVector3Dot(R2, NegEyePosition);
+
+ M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v);
+ M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v);
+ M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v);
+ M.r[3] = g_XMIdentityR3.v;
+
+ M = XMMatrixTranspose(M);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+
+ XMASSERT(!XMVector3Equal(EyeDirection, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(EyeDirection));
+ XMASSERT(!XMVector3Equal(UpDirection, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(UpDirection));
+
+ XMVECTOR R2 = XMVector3Normalize(EyeDirection);
+ XMVECTOR R0 = XMVector3Cross(UpDirection, R2);
+ R0 = XMVector3Normalize(R0);
+ XMVECTOR R1 = XMVector3Cross(R2,R0);
+ XMVECTOR NegEyePosition = _mm_mul_ps(EyePosition,g_XMNegativeOne);
+ XMVECTOR D0 = XMVector3Dot(R0,NegEyePosition);
+ XMVECTOR D1 = XMVector3Dot(R1,NegEyePosition);
+ XMVECTOR D2 = XMVector3Dot(R2,NegEyePosition);
+ R0 = _mm_and_ps(R0,g_XMMask3);
+ R1 = _mm_and_ps(R1,g_XMMask3);
+ R2 = _mm_and_ps(R2,g_XMMask3);
+ D0 = _mm_and_ps(D0,g_XMMaskW);
+ D1 = _mm_and_ps(D1,g_XMMaskW);
+ D2 = _mm_and_ps(D2,g_XMMaskW);
+ D0 = _mm_or_ps(D0,R0);
+ D1 = _mm_or_ps(D1,R1);
+ D2 = _mm_or_ps(D2,R2);
+ M.r[0] = D0;
+ M.r[1] = D1;
+ M.r[2] = D2;
+ M.r[3] = g_XMIdentityR3;
+ M = XMMatrixTranspose(M);
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixLookToRH
+(
+ FXMVECTOR EyePosition,
+ FXMVECTOR EyeDirection,
+ FXMVECTOR UpDirection
+)
+{
+ XMVECTOR NegEyeDirection;
+ XMMATRIX M;
+
+ NegEyeDirection = XMVectorNegate(EyeDirection);
+ M = XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
+
+ return M;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveLH
+(
+ FLOAT ViewWidth,
+ FLOAT ViewHeight,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT TwoNearZ, fRange;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ TwoNearZ = NearZ + NearZ;
+ fRange = FarZ / (FarZ - NearZ);
+ M.m[0][0] = TwoNearZ / ViewWidth;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = TwoNearZ / ViewHeight;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = fRange;
+ M.m[2][3] = 1.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = -fRange * NearZ;
+ M.m[3][3] = 0.0f;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ XMMATRIX M;
+ FLOAT TwoNearZ = NearZ + NearZ;
+ FLOAT fRange = FarZ / (FarZ - NearZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ TwoNearZ / ViewWidth,
+ TwoNearZ / ViewHeight,
+ fRange,
+ -fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // TwoNearZ / ViewWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,TwoNearZ / ViewHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=-fRange * NearZ,0,1.0f
+ vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,1.0f
+ vTemp = _mm_setzero_ps();
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,-fRange * NearZ,0
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
+ M.r[3] = vTemp;
+
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveRH
+(
+ FLOAT ViewWidth,
+ FLOAT ViewHeight,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT TwoNearZ, fRange;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ TwoNearZ = NearZ + NearZ;
+ fRange = FarZ / (NearZ - FarZ);
+ M.m[0][0] = TwoNearZ / ViewWidth;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = TwoNearZ / ViewHeight;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = 0.0f;
+ M.m[2][1] = 0.0f;
+ M.m[2][2] = fRange;
+ M.m[2][3] = -1.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = fRange * NearZ;
+ M.m[3][3] = 0.0f;
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ XMMATRIX M;
+ FLOAT TwoNearZ = NearZ + NearZ;
+ FLOAT fRange = FarZ / (NearZ-FarZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ TwoNearZ / ViewWidth,
+ TwoNearZ / ViewHeight,
+ fRange,
+ fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // TwoNearZ / ViewWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,TwoNearZ / ViewHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=fRange * NearZ,0,-1.0f
+ vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,-1.0f
+ vTemp = _mm_setzero_ps();
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,fRange * NearZ,0
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH
+(
+ FLOAT FovAngleY,
+ FLOAT AspectHByW,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT SinFov;
+ FLOAT CosFov;
+ FLOAT Height;
+ FLOAT Width;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
+ XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
+
+ Height = CosFov / SinFov;
+ Width = Height / AspectHByW;
+
+ M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, FarZ / (FarZ - NearZ), 1.0f);
+ M.r[3] = XMVectorSet(0.0f, 0.0f, -M.r[2].vector4_f32[2] * NearZ, 0.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
+ XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+ XMMATRIX M;
+ FLOAT SinFov;
+ FLOAT CosFov;
+ XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
+ FLOAT fRange = FarZ / (FarZ-NearZ);
+ // Note: This is recorded on the stack
+ FLOAT Height = CosFov / SinFov;
+ XMVECTOR rMem = {
+ Height / AspectHByW,
+ Height,
+ fRange,
+ -fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // CosFov / SinFov,0,0,0
+ M.r[0] = vTemp;
+ // 0,Height / AspectHByW,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=-fRange * NearZ,0,1.0f
+ vTemp = _mm_setzero_ps();
+ vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,1.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,-fRange * NearZ,0.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
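+// A usage sketch (illustrative, not part of the original sources). As the code
+// above computes Width = Height / AspectHByW, the aspect parameter is the
+// view-space width divided by height; the view matrix below is an assumed
+// existing camera matrix.
+//
+//     XMMATRIX proj = XMMatrixPerspectiveFovLH(
+//         XMConvertToRadians(60.0f),   // vertical field of view
+//         1280.0f / 720.0f,            // aspect ratio (width / height)
+//         0.1f, 1000.0f);              // near and far planes
+//     XMMATRIX viewProj = XMMatrixMultiply(view, proj);
+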
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH
+(
+ FLOAT FovAngleY,
+ FLOAT AspectHByW,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT SinFov;
+ FLOAT CosFov;
+ FLOAT Height;
+ FLOAT Width;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
+ XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
+
+ Height = CosFov / SinFov;
+ Width = Height / AspectHByW;
+
+ M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, FarZ / (NearZ - FarZ), -1.0f);
+ M.r[3] = XMVectorSet(0.0f, 0.0f, M.r[2].vector4_f32[2] * NearZ, 0.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
+ XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+ XMMATRIX M;
+ FLOAT SinFov;
+ FLOAT CosFov;
+ XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
+ FLOAT fRange = FarZ / (NearZ-FarZ);
+ // Note: This is recorded on the stack
+ FLOAT Height = CosFov / SinFov;
+ XMVECTOR rMem = {
+ Height / AspectHByW,
+ Height,
+ fRange,
+ fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // CosFov / SinFov,0,0,0
+ M.r[0] = vTemp;
+ // 0,Height / AspectHByW,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=fRange * NearZ,0,-1.0f
+ vTemp = _mm_setzero_ps();
+ vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,-1.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,fRange * NearZ,0.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveOffCenterLH
+(
+ FLOAT ViewLeft,
+ FLOAT ViewRight,
+ FLOAT ViewBottom,
+ FLOAT ViewTop,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT TwoNearZ;
+ FLOAT ReciprocalWidth;
+ FLOAT ReciprocalHeight;
+ FLOAT fRange;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ TwoNearZ = NearZ + NearZ;
+ ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ fRange = FarZ / (FarZ-NearZ);
+
+ M.m[0][0] = TwoNearZ * ReciprocalWidth;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = TwoNearZ * ReciprocalHeight;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
+ M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
+ M.m[2][2] = fRange;
+ M.m[2][3] = 1.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = -fRange * NearZ;
+ M.m[3][3] = 0.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+ XMMATRIX M;
+ FLOAT TwoNearZ = NearZ+NearZ;
+ FLOAT ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ FLOAT ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ FLOAT fRange = FarZ / (FarZ-NearZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ TwoNearZ*ReciprocalWidth,
+ TwoNearZ*ReciprocalHeight,
+ -fRange * NearZ,
+ 0
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // TwoNearZ*ReciprocalWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,TwoNearZ*ReciprocalHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // -(ViewLeft + ViewRight) * ReciprocalWidth,-(ViewTop + ViewBottom) * ReciprocalHeight,fRange,1.0f
+ M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth,
+ -(ViewTop + ViewBottom) * ReciprocalHeight,
+ fRange,
+ 1.0f );
+ // 0,0,-fRange * NearZ,0.0f
+ vValues = _mm_and_ps(vValues,g_XMMaskZ);
+ M.r[3] = vValues;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixPerspectiveOffCenterRH
+(
+ FLOAT ViewLeft,
+ FLOAT ViewRight,
+ FLOAT ViewBottom,
+ FLOAT ViewTop,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT TwoNearZ;
+ FLOAT ReciprocalWidth;
+ FLOAT ReciprocalHeight;
+ FLOAT fRange;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ TwoNearZ = NearZ + NearZ;
+ ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ fRange = FarZ / (NearZ-FarZ);
+
+ M.m[0][0] = TwoNearZ * ReciprocalWidth;
+ M.m[0][1] = 0.0f;
+ M.m[0][2] = 0.0f;
+ M.m[0][3] = 0.0f;
+
+ M.m[1][0] = 0.0f;
+ M.m[1][1] = TwoNearZ * ReciprocalHeight;
+ M.m[1][2] = 0.0f;
+ M.m[1][3] = 0.0f;
+
+ M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth;
+ M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight;
+ M.m[2][2] = fRange;
+ M.m[2][3] = -1.0f;
+
+ M.m[3][0] = 0.0f;
+ M.m[3][1] = 0.0f;
+ M.m[3][2] = fRange * NearZ;
+ M.m[3][3] = 0.0f;
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ XMMATRIX M;
+ FLOAT TwoNearZ = NearZ+NearZ;
+ FLOAT ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ FLOAT ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ FLOAT fRange = FarZ / (NearZ-FarZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ TwoNearZ*ReciprocalWidth,
+ TwoNearZ*ReciprocalHeight,
+ fRange * NearZ,
+ 0
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // TwoNearZ*ReciprocalWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,TwoNearZ*ReciprocalHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // (ViewLeft + ViewRight) * ReciprocalWidth,(ViewTop + ViewBottom) * ReciprocalHeight,fRange,-1.0f
+ M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth,
+ (ViewTop + ViewBottom) * ReciprocalHeight,
+ fRange,
+ -1.0f);
+ // 0,0,fRange * NearZ,0.0f
+ vValues = _mm_and_ps(vValues,g_XMMaskZ);
+ M.r[3] = vValues;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixOrthographicLH
+(
+ FLOAT ViewWidth,
+ FLOAT ViewHeight,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT fRange;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ fRange = 1.0f / (FarZ-NearZ);
+ M.r[0] = XMVectorSet(2.0f / ViewWidth, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, 2.0f / ViewHeight, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, fRange, 0.0f);
+ M.r[3] = XMVectorSet(0.0f, 0.0f, -fRange * NearZ, 1.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+ XMMATRIX M;
+ FLOAT fRange = 1.0f / (FarZ-NearZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ 2.0f / ViewWidth,
+ 2.0f / ViewHeight,
+ fRange,
+ -fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // 2.0f / ViewWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,2.0f / ViewHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=-fRange * NearZ,0,1.0f
+ vTemp = _mm_setzero_ps();
+ vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,0.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,-fRange * NearZ,1.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0));
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixOrthographicRH
+(
+ FLOAT ViewWidth,
+ FLOAT ViewHeight,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ M.r[0] = XMVectorSet(2.0f / ViewWidth, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, 2.0f / ViewHeight, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, 1.0f / (NearZ - FarZ), 0.0f);
+ M.r[3] = XMVectorSet(0.0f, 0.0f, M.r[2].vector4_f32[2] * NearZ, 1.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+ XMMATRIX M;
+ FLOAT fRange = 1.0f / (NearZ-FarZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ 2.0f / ViewWidth,
+ 2.0f / ViewHeight,
+ fRange,
+ fRange * NearZ
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // 2.0f / ViewWidth,0,0,0
+ M.r[0] = vTemp;
+ // 0,2.0f / ViewHeight,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ M.r[1] = vTemp;
+ // x=fRange,y=fRange * NearZ,0,1.0f
+ vTemp = _mm_setzero_ps();
+ vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2));
+ // 0,0,fRange,0.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0));
+ M.r[2] = vTemp;
+ // 0,0,fRange * NearZ,1.0f
+ vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0));
+ M.r[3] = vTemp;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixOrthographicOffCenterLH
+(
+ FLOAT ViewLeft,
+ FLOAT ViewRight,
+ FLOAT ViewBottom,
+ FLOAT ViewTop,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ReciprocalWidth;
+ FLOAT ReciprocalHeight;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+
+ M.r[0] = XMVectorSet(ReciprocalWidth + ReciprocalWidth, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, ReciprocalHeight + ReciprocalHeight, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, 1.0f / (FarZ - NearZ), 0.0f);
+ M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
+ -(ViewTop + ViewBottom) * ReciprocalHeight,
+ -M.r[2].vector4_f32[2] * NearZ,
+ 1.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ FLOAT fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ FLOAT fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ FLOAT fRange = 1.0f / (FarZ-NearZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ fReciprocalWidth,
+ fReciprocalHeight,
+ fRange,
+ 1.0f
+ };
+ XMVECTOR rMem2 = {
+ -(ViewLeft + ViewRight),
+ -(ViewTop + ViewBottom),
+ -NearZ,
+ 1.0f
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // fReciprocalWidth*2,0,0,0
+ vTemp = _mm_add_ss(vTemp,vTemp);
+ M.r[0] = vTemp;
+ // 0,fReciprocalHeight*2,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ vTemp = _mm_add_ps(vTemp,vTemp);
+ M.r[1] = vTemp;
+ // 0,0,fRange,0.0f
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskZ);
+ M.r[2] = vTemp;
+ // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f
+ vValues = _mm_mul_ps(vValues,rMem2);
+ M.r[3] = vValues;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMMATRIX XMMatrixOrthographicOffCenterRH
+(
+ FLOAT ViewLeft,
+ FLOAT ViewRight,
+ FLOAT ViewBottom,
+ FLOAT ViewTop,
+ FLOAT NearZ,
+ FLOAT FarZ
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ReciprocalWidth;
+ FLOAT ReciprocalHeight;
+ XMMATRIX M;
+
+ XMASSERT(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
+ XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
+
+ ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+
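+ // Same layout as the off-center LH version except M33 = 1/(NearZ-FarZ);
+ // the translation term M43 = NearZ/(NearZ-FarZ) is numerically the same as in the LH case.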
+ M.r[0] = XMVectorSet(ReciprocalWidth + ReciprocalWidth, 0.0f, 0.0f, 0.0f);
+ M.r[1] = XMVectorSet(0.0f, ReciprocalHeight + ReciprocalHeight, 0.0f, 0.0f);
+ M.r[2] = XMVectorSet(0.0f, 0.0f, 1.0f / (NearZ - FarZ), 0.0f);
+ M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
+ -(ViewTop + ViewBottom) * ReciprocalHeight,
+ M.r[2].vector4_f32[2] * NearZ,
+ 1.0f);
+
+ return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX M;
+ FLOAT fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
+ FLOAT fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
+ FLOAT fRange = 1.0f / (NearZ-FarZ);
+ // Note: This is recorded on the stack
+ XMVECTOR rMem = {
+ fReciprocalWidth,
+ fReciprocalHeight,
+ fRange,
+ 1.0f
+ };
+ XMVECTOR rMem2 = {
+ -(ViewLeft + ViewRight),
+ -(ViewTop + ViewBottom),
+ NearZ,
+ 1.0f
+ };
+ // Copy from memory to SSE register
+ XMVECTOR vValues = rMem;
+ XMVECTOR vTemp = _mm_setzero_ps();
+ // Copy x only
+ vTemp = _mm_move_ss(vTemp,vValues);
+ // fReciprocalWidth*2,0,0,0
+ vTemp = _mm_add_ss(vTemp,vTemp);
+ M.r[0] = vTemp;
+ // 0,fReciprocalHeight*2,0,0
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskY);
+ vTemp = _mm_add_ps(vTemp,vTemp);
+ M.r[1] = vTemp;
+ // 0,0,fRange,0.0f
+ vTemp = vValues;
+ vTemp = _mm_and_ps(vTemp,g_XMMaskZ);
+ M.r[2] = vTemp;
+ // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*NearZ,1.0f
+ vValues = _mm_mul_ps(vValues,rMem2);
+ M.r[3] = vValues;
+ return M;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+#ifdef __cplusplus
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMMATRIX::_XMMATRIX
+(
+ FXMVECTOR R0,
+ FXMVECTOR R1,
+ FXMVECTOR R2,
+ CXMVECTOR R3
+)
+{
+ r[0] = R0;
+ r[1] = R1;
+ r[2] = R2;
+ r[3] = R3;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMMATRIX::_XMMATRIX
+(
+ FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33
+)
+{
+ r[0] = XMVectorSet(m00, m01, m02, m03);
+ r[1] = XMVectorSet(m10, m11, m12, m13);
+ r[2] = XMVectorSet(m20, m21, m22, m23);
+ r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMMATRIX::_XMMATRIX
+(
+ CONST FLOAT* pArray
+)
+{
+ r[0] = XMLoadFloat4((const XMFLOAT4*)pArray);
+ r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4));
+ r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8));
+ r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMMATRIX& _XMMATRIX::operator=
+(
+ CONST _XMMATRIX& M
+)
+{
+ r[0] = M.r[0];
+ r[1] = M.r[1];
+ r[2] = M.r[2];
+ r[3] = M.r[3];
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+#ifndef XM_NO_OPERATOR_OVERLOADS
+
+#if !defined(_XBOX_VER) && defined(_XM_ISVS2005_) && defined(_XM_X64_)
+#pragma warning(push)
+#pragma warning(disable : 4328)
+#endif
+
+XMFINLINE _XMMATRIX& _XMMATRIX::operator*=
+(
+ CONST _XMMATRIX& M
+)
+{
+ *this = XMMatrixMultiply(*this, M);
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMMATRIX _XMMATRIX::operator*
+(
+ CONST _XMMATRIX& M
+) CONST
+{
+ return XMMatrixMultiply(*this, M);
+}
+
+#if !defined(_XBOX_VER) && defined(_XM_ISVS2005_) && defined(_XM_X64_)
+#pragma warning(pop)
+#endif
+
+#endif // !XM_NO_OPERATOR_OVERLOADS
+
+/****************************************************************************
+ *
+ * XMFLOAT3X3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT3X3::_XMFLOAT3X3
+(
+ FLOAT m00, FLOAT m01, FLOAT m02,
+ FLOAT m10, FLOAT m11, FLOAT m12,
+ FLOAT m20, FLOAT m21, FLOAT m22
+)
+{
+ m[0][0] = m00;
+ m[0][1] = m01;
+ m[0][2] = m02;
+
+ m[1][0] = m10;
+ m[1][1] = m11;
+ m[1][2] = m12;
+
+ m[2][0] = m20;
+ m[2][1] = m21;
+ m[2][2] = m22;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT3X3::_XMFLOAT3X3
+(
+ CONST FLOAT* pArray
+)
+{
+ UINT Row;
+ UINT Column;
+
+ for (Row = 0; Row < 3; Row++)
+ {
+ for (Column = 0; Column < 3; Column++)
+ {
+ m[Row][Column] = pArray[Row * 3 + Column];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT3X3& _XMFLOAT3X3::operator=
+(
+ CONST _XMFLOAT3X3& Float3x3
+)
+{
+ _11 = Float3x3._11;
+ _12 = Float3x3._12;
+ _13 = Float3x3._13;
+ _21 = Float3x3._21;
+ _22 = Float3x3._22;
+ _23 = Float3x3._23;
+ _31 = Float3x3._31;
+ _32 = Float3x3._32;
+ _33 = Float3x3._33;
+
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT4X3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X3::_XMFLOAT4X3
+(
+ FLOAT m00, FLOAT m01, FLOAT m02,
+ FLOAT m10, FLOAT m11, FLOAT m12,
+ FLOAT m20, FLOAT m21, FLOAT m22,
+ FLOAT m30, FLOAT m31, FLOAT m32
+)
+{
+ m[0][0] = m00;
+ m[0][1] = m01;
+ m[0][2] = m02;
+
+ m[1][0] = m10;
+ m[1][1] = m11;
+ m[1][2] = m12;
+
+ m[2][0] = m20;
+ m[2][1] = m21;
+ m[2][2] = m22;
+
+ m[3][0] = m30;
+ m[3][1] = m31;
+ m[3][2] = m32;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X3::_XMFLOAT4X3
+(
+ CONST FLOAT* pArray
+)
+{
+ UINT Row;
+ UINT Column;
+
+ for (Row = 0; Row < 4; Row++)
+ {
+ for (Column = 0; Column < 3; Column++)
+ {
+ m[Row][Column] = pArray[Row * 3 + Column];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X3& _XMFLOAT4X3::operator=
+(
+ CONST _XMFLOAT4X3& Float4x3
+)
+{
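+ // The 12 floats of the 4x3 matrix are contiguous, so copy them as three
+ // unaligned 4-float loads/stores starting at _11, _22 and _33.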
+ XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11);
+ XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22);
+ XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33);
+
+ XMStoreFloat4((XMFLOAT4*)&_11, V1);
+ XMStoreFloat4((XMFLOAT4*)&_22, V2);
+ XMStoreFloat4((XMFLOAT4*)&_33, V3);
+
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT4X3A& XMFLOAT4X3A::operator=
+(
+ CONST XMFLOAT4X3A& Float4x3
+)
+{
+ XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11);
+ XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22);
+ XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33);
+
+ XMStoreFloat4A((XMFLOAT4A*)&_11, V1);
+ XMStoreFloat4A((XMFLOAT4A*)&_22, V2);
+ XMStoreFloat4A((XMFLOAT4A*)&_33, V3);
+
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT4X4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X4::_XMFLOAT4X4
+(
+ FLOAT m00, FLOAT m01, FLOAT m02, FLOAT m03,
+ FLOAT m10, FLOAT m11, FLOAT m12, FLOAT m13,
+ FLOAT m20, FLOAT m21, FLOAT m22, FLOAT m23,
+ FLOAT m30, FLOAT m31, FLOAT m32, FLOAT m33
+)
+{
+ m[0][0] = m00;
+ m[0][1] = m01;
+ m[0][2] = m02;
+ m[0][3] = m03;
+
+ m[1][0] = m10;
+ m[1][1] = m11;
+ m[1][2] = m12;
+ m[1][3] = m13;
+
+ m[2][0] = m20;
+ m[2][1] = m21;
+ m[2][2] = m22;
+ m[2][3] = m23;
+
+ m[3][0] = m30;
+ m[3][1] = m31;
+ m[3][2] = m32;
+ m[3][3] = m33;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X4::_XMFLOAT4X4
+(
+ CONST FLOAT* pArray
+)
+{
+ UINT Row;
+ UINT Column;
+
+ for (Row = 0; Row < 4; Row++)
+ {
+ for (Column = 0; Column < 4; Column++)
+ {
+ m[Row][Column] = pArray[Row * 4 + Column];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4X4& _XMFLOAT4X4::operator=
+(
+ CONST _XMFLOAT4X4& Float4x4
+)
+{
+ XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11);
+ XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21);
+ XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31);
+ XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41);
+
+ XMStoreFloat4((XMFLOAT4*)&_11, V1);
+ XMStoreFloat4((XMFLOAT4*)&_21, V2);
+ XMStoreFloat4((XMFLOAT4*)&_31, V3);
+ XMStoreFloat4((XMFLOAT4*)&_41, V4);
+
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT4X4A& XMFLOAT4X4A::operator=
+(
+ CONST XMFLOAT4X4A& Float4x4
+)
+{
+ XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11);
+ XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21);
+ XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31);
+ XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41);
+
+ XMStoreFloat4A((XMFLOAT4A*)&_11, V1);
+ XMStoreFloat4A((XMFLOAT4A*)&_21, V2);
+ XMStoreFloat4A((XMFLOAT4A*)&_31, V3);
+ XMStoreFloat4A((XMFLOAT4A*)&_41, V4);
+
+ return *this;
+}
+
+#endif // __cplusplus
+
+#endif // __XNAMATHMATRIX_INL__
+
diff --git a/thirdparty/directxtex/XNAMath/xnamathmisc.inl b/thirdparty/directxtex/XNAMath/xnamathmisc.inl
new file mode 100644
index 00000000..d4d4ef2d
--- /dev/null
+++ b/thirdparty/directxtex/XNAMath/xnamathmisc.inl
@@ -0,0 +1,2460 @@
+/************************************************************************
+* *
+* xnamathmisc.inl -- SIMD C++ Math library for Windows and Xbox 360 *
+* Quaternion, plane, and color functions *
+* *
+* Copyright (c) Microsoft Corp. All rights reserved. *
+* *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATHMISC_INL__
+#define __XNAMATHMISC_INL__
+
+/****************************************************************************
+ *
+ * Quaternion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionEqual
+(
+ FXMVECTOR Q1,
+ FXMVECTOR Q2
+)
+{
+ return XMVector4Equal(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionNotEqual
+(
+ FXMVECTOR Q1,
+ FXMVECTOR Q2
+)
+{
+ return XMVector4NotEqual(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionIsNaN
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4IsNaN(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionIsInfinite
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4IsInfinite(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMQuaternionIsIdentity
+(
+ FXMVECTOR Q
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return XMVector4Equal(Q, g_XMIdentityR3.v);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(Q,g_XMIdentityR3);
+ return (_mm_movemask_ps(vTemp)==0x0f);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionDot
+(
+ FXMVECTOR Q1,
+ FXMVECTOR Q2
+)
+{
+ return XMVector4Dot(Q1, Q2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionMultiply
+(
+ FXMVECTOR Q1,
+ FXMVECTOR Q2
+)
+{
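+ // Computes the quaternion product Q2*Q1 (the rotation Q1 followed by the rotation Q2):
+ // x = Q2.w*Q1.x + Q2.x*Q1.w + Q2.y*Q1.z - Q2.z*Q1.y
+ // y = Q2.w*Q1.y - Q2.x*Q1.z + Q2.y*Q1.w + Q2.z*Q1.x
+ // z = Q2.w*Q1.z + Q2.x*Q1.y - Q2.y*Q1.x + Q2.z*Q1.w
+ // w = Q2.w*Q1.w - Q2.x*Q1.x - Q2.y*Q1.y - Q2.z*Q1.z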
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeQ1;
+ XMVECTOR Q2X;
+ XMVECTOR Q2Y;
+ XMVECTOR Q2Z;
+ XMVECTOR Q2W;
+ XMVECTOR Q1WZYX;
+ XMVECTOR Q1ZWXY;
+ XMVECTOR Q1YXWZ;
+ XMVECTOR Result;
+ CONST XMVECTORU32 ControlWZYX = {XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1X};
+ CONST XMVECTORU32 ControlZWXY = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_1Y};
+ CONST XMVECTORU32 ControlYXWZ = {XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z};
+
+ NegativeQ1 = XMVectorNegate(Q1);
+
+ Q2W = XMVectorSplatW(Q2);
+ Q2X = XMVectorSplatX(Q2);
+ Q2Y = XMVectorSplatY(Q2);
+ Q2Z = XMVectorSplatZ(Q2);
+
+ Q1WZYX = XMVectorPermute(Q1, NegativeQ1, ControlWZYX.v);
+ Q1ZWXY = XMVectorPermute(Q1, NegativeQ1, ControlZWXY.v);
+ Q1YXWZ = XMVectorPermute(Q1, NegativeQ1, ControlYXWZ.v);
+
+ Result = XMVectorMultiply(Q1, Q2W);
+ Result = XMVectorMultiplyAdd(Q1WZYX, Q2X, Result);
+ Result = XMVectorMultiplyAdd(Q1ZWXY, Q2Y, Result);
+ Result = XMVectorMultiplyAdd(Q1YXWZ, Q2Z, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
+ static CONST XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
+ static CONST XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
+ // Copy to SSE registers and use as few as possible for x86
+ XMVECTOR Q2X = Q2;
+ XMVECTOR Q2Y = Q2;
+ XMVECTOR Q2Z = Q2;
+ XMVECTOR vResult = Q2;
+ // Splat with one instruction
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
+ Q2X = _mm_shuffle_ps(Q2X,Q2X,_MM_SHUFFLE(0,0,0,0));
+ Q2Y = _mm_shuffle_ps(Q2Y,Q2Y,_MM_SHUFFLE(1,1,1,1));
+ Q2Z = _mm_shuffle_ps(Q2Z,Q2Z,_MM_SHUFFLE(2,2,2,2));
+ // Retire Q1 and perform Q1*Q2W
+ vResult = _mm_mul_ps(vResult,Q1);
+ XMVECTOR Q1Shuffle = Q1;
+ // Shuffle the copies of Q1
+ Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
+ // Mul by Q1WZYX
+ Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
+ Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
+ // Flip the signs on y and z
+ Q2X = _mm_mul_ps(Q2X,ControlWZYX);
+ // Mul by Q1ZWXY
+ Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
+ Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
+ // Flip the signs on z and w
+ Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
+ // Mul by Q1YXWZ
+ Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle);
+ vResult = _mm_add_ps(vResult,Q2X);
+ // Flip the signs on x and w
+ Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ);
+ Q2Y = _mm_add_ps(Q2Y,Q2Z);
+ vResult = _mm_add_ps(vResult,Q2Y);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionLengthSq
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4LengthSq(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionReciprocalLength
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4ReciprocalLength(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionLength
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4Length(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionNormalizeEst
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4NormalizeEst(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionNormalize
+(
+ FXMVECTOR Q
+)
+{
+ return XMVector4Normalize(Q);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionConjugate
+(
+ FXMVECTOR Q
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result = {
+ -Q.vector4_f32[0],
+ -Q.vector4_f32[1],
+ -Q.vector4_f32[2],
+ Q.vector4_f32[3]
+ };
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
+ XMVECTOR Result = _mm_mul_ps(Q,NegativeOne3);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionInverse
+(
+ FXMVECTOR Q
+)
+{
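+ // Inverse = Conjugate(Q) / |Q|^2; returns the zero vector when |Q|^2 <= epsilon.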
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Conjugate;
+ XMVECTOR L;
+ XMVECTOR Control;
+ XMVECTOR Result;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ L = XMVector4LengthSq(Q);
+ Conjugate = XMQuaternionConjugate(Q);
+
+ Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);
+
+ L = XMVectorReciprocal(L);
+ Result = XMVectorMultiply(Conjugate, L);
+
+ Result = XMVectorSelect(Result, Zero, Control);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Conjugate;
+ XMVECTOR L;
+ XMVECTOR Control;
+ XMVECTOR Result;
+ XMVECTOR Zero = XMVectorZero();
+
+ L = XMVector4LengthSq(Q);
+ Conjugate = XMQuaternionConjugate(Q);
+ Control = XMVectorLessOrEqual(L, g_XMEpsilon);
+ Result = _mm_div_ps(Conjugate,L);
+ Result = XMVectorSelect(Result, Zero, Control);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionLn
+(
+ FXMVECTOR Q
+)
+{
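+ // For a unit quaternion Q = ( sin(theta)*axis, cos(theta) ) this returns ( theta*axis, 0 ).
+ // If |Q.w| >= 1 - epsilon the vector part is returned as-is (with w forced to 0).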
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Q0;
+ XMVECTOR QW;
+ XMVECTOR Theta;
+ XMVECTOR SinTheta;
+ XMVECTOR S;
+ XMVECTOR ControlW;
+ XMVECTOR Result;
+ static CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
+
+ QW = XMVectorSplatW(Q);
+ Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);
+
+ ControlW = XMVectorInBounds(QW, OneMinusEpsilon);
+
+ Theta = XMVectorACos(QW);
+ SinTheta = XMVectorSin(Theta);
+
+ S = XMVectorReciprocal(SinTheta);
+ S = XMVectorMultiply(Theta, S);
+
+ Result = XMVectorMultiply(Q0, S);
+
+ Result = XMVectorSelect(Q0, Result, ControlW);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
+ static CONST XMVECTORF32 NegOneMinusEpsilon = {-(1.0f - 0.00001f), -(1.0f - 0.00001f),-(1.0f - 0.00001f),-(1.0f - 0.00001f)};
+ // Get W only
+ XMVECTOR QW = _mm_shuffle_ps(Q,Q,_MM_SHUFFLE(3,3,3,3));
+ // W = 0
+ XMVECTOR Q0 = _mm_and_ps(Q,g_XMMask3);
+ // Use W if within bounds
+ XMVECTOR ControlW = _mm_cmple_ps(QW,OneMinusEpsilon);
+ XMVECTOR vTemp2 = _mm_cmpge_ps(QW,NegOneMinusEpsilon);
+ ControlW = _mm_and_ps(ControlW,vTemp2);
+ // Get theta
+ XMVECTOR vTheta = XMVectorACos(QW);
+ // Get Sine of theta
+ vTemp2 = XMVectorSin(vTheta);
+ // theta/sine of theta
+ vTheta = _mm_div_ps(vTheta,vTemp2);
+ // Here's the answer
+ vTheta = _mm_mul_ps(vTheta,Q0);
+ // Was W in bounds? If not, return input as is
+ vTheta = XMVectorSelect(Q0,vTheta,ControlW);
+ return vTheta;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionExp
+(
+ FXMVECTOR Q
+)
+{
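+ // Inverse of XMQuaternionLn: for Q = ( theta*axis, w ) with theta = |Q.xyz|, returns
+ // ( sin(theta)*axis, cos(theta) ); the input w is ignored. When theta is near zero
+ // the xyz part is passed through unchanged.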
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Theta;
+ XMVECTOR SinTheta;
+ XMVECTOR CosTheta;
+ XMVECTOR S;
+ XMVECTOR Control;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+
+ Theta = XMVector3Length(Q);
+ XMVectorSinCos(&SinTheta, &CosTheta, Theta);
+
+ S = XMVectorReciprocal(Theta);
+ S = XMVectorMultiply(SinTheta, S);
+
+ Result = XMVectorMultiply(Q, S);
+
+ Zero = XMVectorZero();
+ Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
+ Result = XMVectorSelect(Result, Q, Control);
+
+ Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Theta;
+ XMVECTOR SinTheta;
+ XMVECTOR CosTheta;
+ XMVECTOR S;
+ XMVECTOR Control;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ Theta = XMVector3Length(Q);
+ XMVectorSinCos(&SinTheta, &CosTheta, Theta);
+ S = _mm_div_ps(SinTheta,Theta);
+ Result = _mm_mul_ps(Q, S);
+ Zero = XMVectorZero();
+ Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon);
+ Result = XMVectorSelect(Result,Q,Control);
+ Result = _mm_and_ps(Result,g_XMMask3);
+ CosTheta = _mm_and_ps(CosTheta,g_XMMaskW);
+ Result = _mm_or_ps(Result,CosTheta);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMQuaternionSlerp
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FLOAT t
+)
+{
+ XMVECTOR T = XMVectorReplicate(t);
+ return XMQuaternionSlerpV(Q0, Q1, T);
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMQuaternionSlerpV
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR T
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
+ XMVECTOR Omega;
+ XMVECTOR CosOmega;
+ XMVECTOR SinOmega;
+ XMVECTOR InvSinOmega;
+ XMVECTOR V01;
+ XMVECTOR C1000;
+ XMVECTOR SignMask;
+ XMVECTOR S0;
+ XMVECTOR S1;
+ XMVECTOR Sign;
+ XMVECTOR Control;
+ XMVECTOR Result;
+ XMVECTOR Zero;
+ CONST XMVECTOR OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
+
+ XMASSERT((T.vector4_f32[1] == T.vector4_f32[0]) && (T.vector4_f32[2] == T.vector4_f32[0]) && (T.vector4_f32[3] == T.vector4_f32[0]));
+
+ CosOmega = XMQuaternionDot(Q0, Q1);
+
+ Zero = XMVectorZero();
+ Control = XMVectorLess(CosOmega, Zero);
+ Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);
+
+ CosOmega = XMVectorMultiply(CosOmega, Sign);
+
+ Control = XMVectorLess(CosOmega, OneMinusEpsilon);
+
+ SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
+ SinOmega = XMVectorSqrt(SinOmega);
+
+ Omega = XMVectorATan2(SinOmega, CosOmega);
+
+ SignMask = XMVectorSplatSignMask();
+ C1000 = XMVectorSetBinaryConstant(1, 0, 0, 0);
+ V01 = XMVectorShiftLeft(T, Zero, 2);
+ SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
+ V01 = XMVectorXorInt(V01, SignMask);
+ V01 = XMVectorAdd(C1000, V01);
+
+ InvSinOmega = XMVectorReciprocal(SinOmega);
+
+ S0 = XMVectorMultiply(V01, Omega);
+ S0 = XMVectorSin(S0);
+ S0 = XMVectorMultiply(S0, InvSinOmega);
+
+ S0 = XMVectorSelect(V01, S0, Control);
+
+ S1 = XMVectorSplatY(S0);
+ S0 = XMVectorSplatX(S0);
+
+ S1 = XMVectorMultiply(S1, Sign);
+
+ Result = XMVectorMultiply(Q0, S0);
+ Result = XMVectorMultiplyAdd(Q1, S1, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
+ XMVECTOR Omega;
+ XMVECTOR CosOmega;
+ XMVECTOR SinOmega;
+ XMVECTOR V01;
+ XMVECTOR S0;
+ XMVECTOR S1;
+ XMVECTOR Sign;
+ XMVECTOR Control;
+ XMVECTOR Result;
+ XMVECTOR Zero;
+ static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
+ static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000};
+ static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000};
+
+ XMASSERT((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));
+
+ CosOmega = XMQuaternionDot(Q0, Q1);
+
+ Zero = XMVectorZero();
+ Control = XMVectorLess(CosOmega, Zero);
+ Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);
+
+ CosOmega = _mm_mul_ps(CosOmega, Sign);
+
+ Control = XMVectorLess(CosOmega, OneMinusEpsilon);
+
+ SinOmega = _mm_mul_ps(CosOmega,CosOmega);
+ SinOmega = _mm_sub_ps(g_XMOne,SinOmega);
+ SinOmega = _mm_sqrt_ps(SinOmega);
+
+ Omega = XMVectorATan2(SinOmega, CosOmega);
+
+ V01 = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1));
+ V01 = _mm_and_ps(V01,MaskXY);
+ V01 = _mm_xor_ps(V01,SignMask2);
+ V01 = _mm_add_ps(g_XMIdentityR0, V01);
+
+ S0 = _mm_mul_ps(V01, Omega);
+ S0 = XMVectorSin(S0);
+ S0 = _mm_div_ps(S0, SinOmega);
+
+ S0 = XMVectorSelect(V01, S0, Control);
+
+ S1 = XMVectorSplatY(S0);
+ S0 = XMVectorSplatX(S0);
+
+ S1 = _mm_mul_ps(S1, Sign);
+ Result = _mm_mul_ps(Q0, S0);
+ S1 = _mm_mul_ps(S1, Q1);
+ Result = _mm_add_ps(Result,S1);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionSquad
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR Q2,
+ CXMVECTOR Q3,
+ FLOAT t
+)
+{
+ XMVECTOR T = XMVectorReplicate(t);
+ return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionSquadV
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR Q2,
+ CXMVECTOR Q3,
+ CXMVECTOR T
+)
+{
+ XMVECTOR Q03;
+ XMVECTOR Q12;
+ XMVECTOR TP;
+ XMVECTOR Two;
+ XMVECTOR Result;
+
+ XMASSERT( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );
+
+ TP = T;
+ Two = XMVectorSplatConstant(2, 0);
+
+ Q03 = XMQuaternionSlerpV(Q0, Q3, T);
+ Q12 = XMQuaternionSlerpV(Q1, Q2, T);
+
+ TP = XMVectorNegativeMultiplySubtract(TP, TP, TP);
+ TP = XMVectorMultiply(TP, Two);
+
+ Result = XMQuaternionSlerpV(Q03, Q12, TP);
+
+ return Result;
+
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE VOID XMQuaternionSquadSetup
+(
+ XMVECTOR* pA,
+ XMVECTOR* pB,
+ XMVECTOR* pC,
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR Q2,
+ CXMVECTOR Q3
+)
+{
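+ // Computes the inner control points A and B and the corrected endpoint C used by
+ // XMQuaternionSquad, flipping quaternion signs where needed so that each neighboring
+ // pair is interpolated along the shorter arc.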
+ XMVECTOR SQ0, SQ2, SQ3;
+ XMVECTOR InvQ1, InvQ2;
+ XMVECTOR LnQ0, LnQ1, LnQ2, LnQ3;
+ XMVECTOR ExpQ02, ExpQ13;
+ XMVECTOR LS01, LS12, LS23;
+ XMVECTOR LD01, LD12, LD23;
+ XMVECTOR Control0, Control1, Control2;
+ XMVECTOR NegativeOneQuarter;
+
+ XMASSERT(pA);
+ XMASSERT(pB);
+ XMASSERT(pC);
+
+ LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
+ LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
+ SQ2 = XMVectorNegate(Q2);
+
+ Control1 = XMVectorLess(LS12, LD12);
+ SQ2 = XMVectorSelect(Q2, SQ2, Control1);
+
+ LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
+ LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
+ SQ0 = XMVectorNegate(Q0);
+
+ LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
+ LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
+ SQ3 = XMVectorNegate(Q3);
+
+ Control0 = XMVectorLess(LS01, LD01);
+ Control2 = XMVectorLess(LS23, LD23);
+
+ SQ0 = XMVectorSelect(Q0, SQ0, Control0);
+ SQ3 = XMVectorSelect(Q3, SQ3, Control2);
+
+ InvQ1 = XMQuaternionInverse(Q1);
+ InvQ2 = XMQuaternionInverse(SQ2);
+
+ LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
+ LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
+ LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
+ LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
+
+ NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
+
+ ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
+ ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
+ ExpQ02 = XMQuaternionExp(ExpQ02);
+ ExpQ13 = XMQuaternionExp(ExpQ13);
+
+ *pA = XMQuaternionMultiply(Q1, ExpQ02);
+ *pB = XMQuaternionMultiply(SQ2, ExpQ13);
+ *pC = SQ2;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionBaryCentric
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR Q2,
+ FLOAT f,
+ FLOAT g
+)
+{
+ XMVECTOR Q01;
+ XMVECTOR Q02;
+ FLOAT s;
+ XMVECTOR Result;
+
+ s = f + g;
+
+ if ((s < 0.00001f) && (s > -0.00001f))
+ {
+ Result = Q0;
+ }
+ else
+ {
+ Q01 = XMQuaternionSlerp(Q0, Q1, s);
+ Q02 = XMQuaternionSlerp(Q0, Q2, s);
+
+ Result = XMQuaternionSlerp(Q01, Q02, g / s);
+ }
+
+ return Result;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionBaryCentricV
+(
+ FXMVECTOR Q0,
+ FXMVECTOR Q1,
+ FXMVECTOR Q2,
+ CXMVECTOR F,
+ CXMVECTOR G
+)
+{
+ XMVECTOR Q01;
+ XMVECTOR Q02;
+ XMVECTOR S, GS;
+ XMVECTOR Epsilon;
+ XMVECTOR Result;
+
+ XMASSERT( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
+ XMASSERT( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
+
+ Epsilon = XMVectorSplatConstant(1, 16);
+
+ S = XMVectorAdd(F, G);
+
+ if (XMVector4InBounds(S, Epsilon))
+ {
+ Result = Q0;
+ }
+ else
+ {
+ Q01 = XMQuaternionSlerpV(Q0, Q1, S);
+ Q02 = XMQuaternionSlerpV(Q0, Q2, S);
+ GS = XMVectorReciprocal(S);
+ GS = XMVectorMultiply(G, GS);
+
+ Result = XMQuaternionSlerpV(Q01, Q02, GS);
+ }
+
+ return Result;
+}
+
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionIdentity()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return g_XMIdentityR3.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return g_XMIdentityR3;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYaw
+(
+ FLOAT Pitch,
+ FLOAT Yaw,
+ FLOAT Roll
+)
+{
+ XMVECTOR Angles;
+ XMVECTOR Q;
+
+ Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+ Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+
+ return Q;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionRotationRollPitchYawFromVector
+(
+ FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Q, Q0, Q1;
+ XMVECTOR P0, P1, Y0, Y1, R0, R1;
+ XMVECTOR HalfAngles;
+ XMVECTOR SinAngles, CosAngles;
+ static CONST XMVECTORU32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
+ static CONST XMVECTORU32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
+ static CONST XMVECTORU32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
+ static CONST XMVECTOR Sign = {1.0f, -1.0f, -1.0f, 1.0f};
+
+ HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+ XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+ P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch.v);
+ Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw.v);
+ R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll.v);
+ P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch.v);
+ Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw.v);
+ R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll.v);
+
+ Q1 = XMVectorMultiply(P1, Sign);
+ Q0 = XMVectorMultiply(P0, Y0);
+ Q1 = XMVectorMultiply(Q1, Y1);
+ Q0 = XMVectorMultiply(Q0, R0);
+ Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+ return Q;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Q, Q0, Q1;
+ XMVECTOR P0, P1, Y0, Y1, R0, R1;
+ XMVECTOR HalfAngles;
+ XMVECTOR SinAngles, CosAngles;
+ static CONST XMVECTORI32 ControlPitch = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X};
+ static CONST XMVECTORI32 ControlYaw = {XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y};
+ static CONST XMVECTORI32 ControlRoll = {XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z};
+ static CONST XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};
+
+ HalfAngles = _mm_mul_ps(Angles, g_XMOneHalf);
+ XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+ P0 = XMVectorPermute(SinAngles, CosAngles, ControlPitch);
+ Y0 = XMVectorPermute(SinAngles, CosAngles, ControlYaw);
+ R0 = XMVectorPermute(SinAngles, CosAngles, ControlRoll);
+ P1 = XMVectorPermute(CosAngles, SinAngles, ControlPitch);
+ Y1 = XMVectorPermute(CosAngles, SinAngles, ControlYaw);
+ R1 = XMVectorPermute(CosAngles, SinAngles, ControlRoll);
+
+ Q1 = _mm_mul_ps(P1, Sign);
+ Q0 = _mm_mul_ps(P0, Y0);
+ Q1 = _mm_mul_ps(Q1, Y1);
+ Q0 = _mm_mul_ps(Q0, R0);
+ Q = _mm_mul_ps(Q1, R1);
+ Q = _mm_add_ps(Q,Q0);
+ return Q;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionRotationNormal
+(
+ FXMVECTOR NormalAxis,
+ FLOAT Angle
+)
+{
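+ // Q = ( sin(Angle/2) * NormalAxis.xyz, cos(Angle/2) ); NormalAxis must already be normalized.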
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Q;
+ XMVECTOR N;
+ XMVECTOR Scale;
+
+ N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+ XMScalarSinCos(&Scale.vector4_f32[2], &Scale.vector4_f32[3], 0.5f * Angle);
+
+ Scale.vector4_f32[0] = Scale.vector4_f32[1] = Scale.vector4_f32[2];
+
+ Q = XMVectorMultiply(N, Scale);
+
+ return Q;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
+ N = _mm_or_ps(N,g_XMIdentityR3);
+ XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+ XMVECTOR vSine;
+ XMVECTOR vCosine;
+ XMVectorSinCos(&vSine,&vCosine,Scale);
+ Scale = _mm_and_ps(vSine,g_XMMask3);
+ vCosine = _mm_and_ps(vCosine,g_XMMaskW);
+ Scale = _mm_or_ps(Scale,vCosine);
+ N = _mm_mul_ps(N,Scale);
+ return N;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMQuaternionRotationAxis
+(
+ FXMVECTOR Axis,
+ FLOAT Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Normal;
+ XMVECTOR Q;
+
+ XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(Axis));
+
+ Normal = XMVector3Normalize(Axis);
+ Q = XMQuaternionRotationNormal(Normal, Angle);
+
+ return Q;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Normal;
+ XMVECTOR Q;
+
+ XMASSERT(!XMVector3Equal(Axis, XMVectorZero()));
+ XMASSERT(!XMVector3IsInfinite(Axis));
+
+ Normal = XMVector3Normalize(Axis);
+ Q = XMQuaternionRotationNormal(Normal, Angle);
+ return Q;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMQuaternionRotationMatrix
+(
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+
+ XMVECTOR Q0, Q1, Q2;
+ XMVECTOR M00, M11, M22;
+ XMVECTOR CQ0, CQ1, C;
+ XMVECTOR CX, CY, CZ, CW;
+ XMVECTOR SQ1, Scale;
+ XMVECTOR Rsq, Sqrt, VEqualsNaN;
+ XMVECTOR A, B, P;
+ XMVECTOR PermuteSplat, PermuteSplatT;
+ XMVECTOR SignB, SignBT;
+ XMVECTOR PermuteControl, PermuteControlT;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f};
+ static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f};
+ static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f};
+ static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f};
+ static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f};
+ static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f};
+ static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f};
+ static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W};
+ static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
+ static CONST XMVECTORU32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+ static CONST XMVECTORU32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z};
+ static CONST XMVECTORU32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y};
+ static CONST XMVECTORU32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W};
+ static CONST XMVECTORU32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y};
+ static CONST XMVECTORU32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z};
+ static CONST XMVECTORU32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X};
+ static CONST XMVECTORU32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W};
+
+ M00 = XMVectorSplatX(M.r[0]);
+ M11 = XMVectorSplatY(M.r[1]);
+ M22 = XMVectorSplatZ(M.r[2]);
+
+ Q0 = XMVectorMultiply(SignPNNP.v, M00);
+ Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0);
+ Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0);
+
+ Q1 = XMVectorAdd(Q0, g_XMOne.v);
+
+ Rsq = XMVectorReciprocalSqrt(Q1);
+ VEqualsNaN = XMVectorIsNaN(Rsq);
+ Sqrt = XMVectorMultiply(Q1, Rsq);
+ Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN);
+
+ Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v);
+
+ SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v);
+
+ CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v);
+ CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v);
+ C = XMVectorGreaterOrEqual(CQ0, CQ1);
+
+ CX = XMVectorSplatX(C);
+ CY = XMVectorSplatY(C);
+ CZ = XMVectorSplatZ(C);
+ CW = XMVectorSplatW(C);
+
+ PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ);
+ SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ);
+ PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ);
+
+ PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX);
+ SignB = XMVectorSelect(SignB, SignNPPP.v, CX);
+ PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX);
+
+ PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY);
+ SignBT = XMVectorSelect(SignB, SignPNPP.v, CY);
+ PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY);
+
+ PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX);
+ SignB = XMVectorSelect(SignB, SignBT, CX);
+ PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX);
+
+ PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW);
+ SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW);
+ PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW);
+
+ Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat);
+
+ P = XMVectorPermute(M.r[1], M.r[2],PermuteC.v); // {M10, M12, M20, M21}
+ A = XMVectorPermute(M.r[0], P, PermuteA.v); // {M01, M12, M20, M03}
+ B = XMVectorPermute(M.r[0], P, PermuteB.v); // {M10, M21, M02, M03}
+
+ Q2 = XMVectorMultiplyAdd(SignB, B, A);
+ Q2 = XMVectorMultiply(Q2, Scale);
+
+ Result = XMVectorPermute(Q1, Q2, PermuteControl);
+
+ return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Conversion operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMQuaternionToAxisAngle
+(
+ XMVECTOR* pAxis,
+ FLOAT* pAngle,
+ FXMVECTOR Q
+)
+{
+ XMASSERT(pAxis);
+ XMASSERT(pAngle);
+
+ *pAxis = Q;
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ *pAngle = 2.0f * acosf(XMVectorGetW(Q));
+#else
+ *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q));
+#endif
+}
+
+/****************************************************************************
+ *
+ * Plane
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneEqual
+(
+ FXMVECTOR P1,
+ FXMVECTOR P2
+)
+{
+ return XMVector4Equal(P1, P2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneNearEqual
+(
+ FXMVECTOR P1,
+ FXMVECTOR P2,
+ FXMVECTOR Epsilon
+)
+{
+ XMVECTOR NP1 = XMPlaneNormalize(P1);
+ XMVECTOR NP2 = XMPlaneNormalize(P2);
+ return XMVector4NearEqual(NP1, NP2, Epsilon);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneNotEqual
+(
+ FXMVECTOR P1,
+ FXMVECTOR P2
+)
+{
+ return XMVector4NotEqual(P1, P2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneIsNaN
+(
+ FXMVECTOR P
+)
+{
+ return XMVector4IsNaN(P);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMPlaneIsInfinite
+(
+ FXMVECTOR P
+)
+{
+ return XMVector4IsInfinite(P);
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneDot
+(
+ FXMVECTOR P,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return XMVector4Dot(P, V);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128 vTemp2 = V;
+ __m128 vTemp = _mm_mul_ps(P,vTemp2);
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
+ vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W;
+ vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
+ vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneDotCoord
+(
+ FXMVECTOR P,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V3;
+ XMVECTOR Result;
+
+ // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
+ V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
+ Result = XMVector4Dot(P, V3);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp2 = _mm_and_ps(V,g_XMMask3);
+ vTemp2 = _mm_or_ps(vTemp2,g_XMIdentityR3);
+ XMVECTOR vTemp = _mm_mul_ps(P,vTemp2);
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
+ vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W;
+ vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
+ vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneDotNormal
+(
+ FXMVECTOR P,
+ FXMVECTOR V
+)
+{
+ return XMVector3Dot(P, V);
+}
+
+//------------------------------------------------------------------------------
+// XMPlaneNormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+XMFINLINE XMVECTOR XMPlaneNormalizeEst
+(
+ FXMVECTOR P
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result = XMVector3ReciprocalLength(P);
+ Result = XMVectorMultiply(P, Result);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product
+ XMVECTOR vDot = _mm_mul_ps(P,P);
+ // x=Dot.y, y=Dot.z
+ XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+ // Result.x = x+y
+ vDot = _mm_add_ss(vDot,vTemp);
+ // x=Dot.z
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // Result.x = (x+y)+z
+ vDot = _mm_add_ss(vDot,vTemp);
+ // Splat x
+ vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+ // Get the reciprocal
+ vDot = _mm_rsqrt_ps(vDot);
+ // Apply the reciprocal length to P to normalize
+ vDot = _mm_mul_ps(vDot,P);
+ return vDot;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneNormalize
+(
+ FXMVECTOR P
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2]));
+ // Prevent divide by zero
+ if (fLengthSq) {
+ fLengthSq = 1.0f/fLengthSq;
+ }
+ {
+ XMVECTOR vResult = {
+ P.vector4_f32[0]*fLengthSq,
+ P.vector4_f32[1]*fLengthSq,
+ P.vector4_f32[2]*fLengthSq,
+ P.vector4_f32[3]*fLengthSq
+ };
+ return vResult;
+ }
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y and z only
+ XMVECTOR vLengthSq = _mm_mul_ps(P,P);
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1));
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Prepare for the division
+ XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+ // Failsafe on zero (Or epsilon) length planes
+ // If the length is infinity, set the elements to zero
+ vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+ // Reciprocal mul to perform the normalization
+ vResult = _mm_div_ps(P,vResult);
+ // Any that are infinity, set to zero
+ vResult = _mm_and_ps(vResult,vLengthSq);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneIntersectLine
+(
+ FXMVECTOR P,
+ FXMVECTOR LinePoint1,
+ FXMVECTOR LinePoint2
+)
+{
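+ // Intersects the plane with the line X(t) = LinePoint1 + t*(LinePoint2 - LinePoint1):
+ // t = XMPlaneDotCoord(P, LinePoint1) / ( dot3(P, LinePoint1) - dot3(P, LinePoint2) )
+ // Returns QNaN when the denominator is near zero (line parallel to the plane).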
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1;
+ XMVECTOR V2;
+ XMVECTOR D;
+ XMVECTOR ReciprocalD;
+ XMVECTOR VT;
+ XMVECTOR Point;
+ XMVECTOR Zero;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ V1 = XMVector3Dot(P, LinePoint1);
+ V2 = XMVector3Dot(P, LinePoint2);
+ D = XMVectorSubtract(V1, V2);
+
+ ReciprocalD = XMVectorReciprocal(D);
+ VT = XMPlaneDotCoord(P, LinePoint1);
+ VT = XMVectorMultiply(VT, ReciprocalD);
+
+ Point = XMVectorSubtract(LinePoint2, LinePoint1);
+ Point = XMVectorMultiplyAdd(Point, VT, LinePoint1);
+
+ Zero = XMVectorZero();
+ Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v);
+
+ Result = XMVectorSelect(Point, g_XMQNaN.v, Control);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1;
+ XMVECTOR V2;
+ XMVECTOR D;
+ XMVECTOR VT;
+ XMVECTOR Point;
+ XMVECTOR Zero;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ V1 = XMVector3Dot(P, LinePoint1);
+ V2 = XMVector3Dot(P, LinePoint2);
+ D = _mm_sub_ps(V1, V2);
+
+ VT = XMPlaneDotCoord(P, LinePoint1);
+ VT = _mm_div_ps(VT, D);
+
+ Point = _mm_sub_ps(LinePoint2, LinePoint1);
+ Point = _mm_mul_ps(Point,VT);
+ Point = _mm_add_ps(Point,LinePoint1);
+ Zero = XMVectorZero();
+ Control = XMVectorNearEqual(D, Zero, g_XMEpsilon);
+ Result = XMVectorSelect(Point, g_XMQNaN, Control);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE VOID XMPlaneIntersectPlane
+(
+ XMVECTOR* pLinePoint1,
+ XMVECTOR* pLinePoint2,
+ FXMVECTOR P1,
+ FXMVECTOR P2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1;
+ XMVECTOR V2;
+ XMVECTOR V3;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLengthSq;
+ XMVECTOR Point;
+ XMVECTOR P1W;
+ XMVECTOR P2W;
+ XMVECTOR Control;
+ XMVECTOR LinePoint1;
+ XMVECTOR LinePoint2;
+
+ XMASSERT(pLinePoint1);
+ XMASSERT(pLinePoint2);
+
+ V1 = XMVector3Cross(P2, P1);
+
+ LengthSq = XMVector3LengthSq(V1);
+
+ V2 = XMVector3Cross(P2, V1);
+
+ P1W = XMVectorSplatW(P1);
+ Point = XMVectorMultiply(V2, P1W);
+
+ V3 = XMVector3Cross(V1, P1);
+
+ P2W = XMVectorSplatW(P2);
+ Point = XMVectorMultiplyAdd(V3, P2W, Point);
+
+ RcpLengthSq = XMVectorReciprocal(LengthSq);
+ LinePoint1 = XMVectorMultiply(Point, RcpLengthSq);
+
+ LinePoint2 = XMVectorAdd(LinePoint1, V1);
+
+ Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);
+ *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control);
+ *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pLinePoint1);
+ XMASSERT(pLinePoint2);
+ XMVECTOR V1;
+ XMVECTOR V2;
+ XMVECTOR V3;
+ XMVECTOR LengthSq;
+ XMVECTOR Point;
+ XMVECTOR P1W;
+ XMVECTOR P2W;
+ XMVECTOR Control;
+ XMVECTOR LinePoint1;
+ XMVECTOR LinePoint2;
+
+ V1 = XMVector3Cross(P2, P1);
+
+ LengthSq = XMVector3LengthSq(V1);
+
+ V2 = XMVector3Cross(P2, V1);
+
+ P1W = _mm_shuffle_ps(P1,P1,_MM_SHUFFLE(3,3,3,3));
+ Point = _mm_mul_ps(V2, P1W);
+
+ V3 = XMVector3Cross(V1, P1);
+
+ P2W = _mm_shuffle_ps(P2,P2,_MM_SHUFFLE(3,3,3,3));
+ V3 = _mm_mul_ps(V3,P2W);
+ Point = _mm_add_ps(Point,V3);
+ LinePoint1 = _mm_div_ps(Point,LengthSq);
+
+ LinePoint2 = _mm_add_ps(LinePoint1, V1);
+
+ Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon);
+ *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN, Control);
+ *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN, Control);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneTransform
+(
+ FXMVECTOR P,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR W;
+ XMVECTOR Result;
+
+ W = XMVectorSplatW(P);
+ Z = XMVectorSplatZ(P);
+ Y = XMVectorSplatY(P);
+ X = XMVectorSplatX(P);
+
+ Result = XMVectorMultiply(W, M.r[3]);
+ Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR X = _mm_shuffle_ps(P,P,_MM_SHUFFLE(0,0,0,0));
+ XMVECTOR Y = _mm_shuffle_ps(P,P,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR Z = _mm_shuffle_ps(P,P,_MM_SHUFFLE(2,2,2,2));
+ XMVECTOR W = _mm_shuffle_ps(P,P,_MM_SHUFFLE(3,3,3,3));
+ X = _mm_mul_ps(X, M.r[0]);
+ Y = _mm_mul_ps(Y, M.r[1]);
+ Z = _mm_mul_ps(Z, M.r[2]);
+ W = _mm_mul_ps(W, M.r[3]);
+ X = _mm_add_ps(X,Z);
+ Y = _mm_add_ps(Y,W);
+ X = _mm_add_ps(X,Y);
+ return X;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT4* XMPlaneTransformStream
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT4* pInputStream,
+ size_t InputStride,
+ size_t PlaneCount,
+ CXMMATRIX M
+)
+{
+ return XMVector4TransformStream(pOutputStream,
+ OutputStride,
+ pInputStream,
+ InputStride,
+ PlaneCount,
+ M);
+}
+
+//------------------------------------------------------------------------------
+// Conversion operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneFromPointNormal
+(
+ FXMVECTOR Point,
+ FXMVECTOR Normal
+)
+{
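+ // Plane = ( Normal.xyz, -dot3(Point, Normal) ), so any point X on the plane
+ // satisfies dot3(Normal, X) + Plane.w = 0.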
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR W;
+ XMVECTOR Result;
+
+ W = XMVector3Dot(Point, Normal);
+ W = XMVectorNegate(W);
+ Result = XMVectorSelect(W, Normal, g_XMSelect1110.v);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR W;
+ XMVECTOR Result;
+ W = XMVector3Dot(Point,Normal);
+ W = _mm_mul_ps(W,g_XMNegativeOne);
+ Result = _mm_and_ps(Normal,g_XMMask3);
+ W = _mm_and_ps(W,g_XMMaskW);
+ Result = _mm_or_ps(Result,W);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMPlaneFromPoints
+(
+ FXMVECTOR Point1,
+ FXMVECTOR Point2,
+ FXMVECTOR Point3
+)
+{
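+ // N = normalize( cross(Point1 - Point2, Point1 - Point3) ); Plane = ( N.xyz, -dot3(N, Point1) ).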
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR N;
+ XMVECTOR D;
+ XMVECTOR V21;
+ XMVECTOR V31;
+ XMVECTOR Result;
+
+ V21 = XMVectorSubtract(Point1, Point2);
+ V31 = XMVectorSubtract(Point1, Point3);
+
+ N = XMVector3Cross(V21, V31);
+ N = XMVector3Normalize(N);
+
+ D = XMPlaneDotNormal(N, Point1);
+ D = XMVectorNegate(D);
+
+ Result = XMVectorSelect(D, N, g_XMSelect1110.v);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR N;
+ XMVECTOR D;
+ XMVECTOR V21;
+ XMVECTOR V31;
+ XMVECTOR Result;
+
+ V21 = _mm_sub_ps(Point1, Point2);
+ V31 = _mm_sub_ps(Point1, Point3);
+
+ N = XMVector3Cross(V21, V31);
+ N = XMVector3Normalize(N);
+
+ D = XMPlaneDotNormal(N, Point1);
+ D = _mm_mul_ps(D,g_XMNegativeOne);
+ N = _mm_and_ps(N,g_XMMask3);
+ D = _mm_and_ps(D,g_XMMaskW);
+ Result = _mm_or_ps(D,N);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * Color
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorEqual
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4Equal(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorNotEqual
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4NotEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorGreater
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4Greater(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorGreaterOrEqual
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4GreaterOrEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorLess
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4Less(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorLessOrEqual
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVector4LessOrEqual(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorIsNaN
+(
+ FXMVECTOR C
+)
+{
+ return XMVector4IsNaN(C);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMColorIsInfinite
+(
+ FXMVECTOR C
+)
+{
+ return XMVector4IsInfinite(C);
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMColorNegative
+(
+ FXMVECTOR vColor
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+// XMASSERT(XMVector4GreaterOrEqual(C, XMVectorReplicate(0.0f)));
+// XMASSERT(XMVector4LessOrEqual(C, XMVectorReplicate(1.0f)));
+ XMVECTOR vResult = {
+ 1.0f - vColor.vector4_f32[0],
+ 1.0f - vColor.vector4_f32[1],
+ 1.0f - vColor.vector4_f32[2],
+ vColor.vector4_f32[3]
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Negate only x,y and z.
+ XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3);
+ // Add 1,1,1,0 to -x,-y,-z,w
+ return _mm_add_ps(vTemp,g_XMOne3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMColorModulate
+(
+ FXMVECTOR C1,
+ FXMVECTOR C2
+)
+{
+ return XMVectorMultiply(C1, C2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMColorAdjustSaturation
+(
+ FXMVECTOR vColor,
+ FLOAT fSaturation
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ CONST XMVECTOR gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
+
+ // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
+ // Result = (C - Luminance) * Saturation + Luminance;
+
+ FLOAT fLuminance = (vColor.vector4_f32[0]*gvLuminance.vector4_f32[0])+(vColor.vector4_f32[1]*gvLuminance.vector4_f32[1])+(vColor.vector4_f32[2]*gvLuminance.vector4_f32[2]);
+ XMVECTOR vResult = {
+ ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance,
+ ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance,
+ ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance,
+ vColor.vector4_f32[3]};
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
+// Mul RGB by intensity constants
+ XMVECTOR vLuminance = _mm_mul_ps(vColor,gvLuminance);
+// vResult.x = vLuminance.y, vResult.y = vLuminance.y,
+// vResult.z = vLuminance.z, vResult.w = vLuminance.z
+ XMVECTOR vResult = vLuminance;
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,1,1));
+// vLuminance.x += vLuminance.y
+ vLuminance = _mm_add_ss(vLuminance,vResult);
+// Splat vLuminance.z
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
+// vLuminance.x += vLuminance.z (Dot product)
+ vLuminance = _mm_add_ss(vLuminance,vResult);
+// Splat vLuminance
+ vLuminance = _mm_shuffle_ps(vLuminance,vLuminance,_MM_SHUFFLE(0,0,0,0));
+// Splat fSaturation
+ XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
+// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
+ vResult = _mm_sub_ps(vColor,vLuminance);
+ vResult = _mm_mul_ps(vResult,vSaturation);
+ vResult = _mm_add_ps(vResult,vLuminance);
+// Retain w from the source color
+ vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
+ vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
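+
+// Worked example (illustrative note, not part of the XNAMath source): with
+// fSaturation = 0 the color collapses to its luminance. For pure red
+// (1, 0, 0, 1):
+//   Luminance = 0.2125*1 + 0.7154*0 + 0.0721*0 = 0.2125
+//   Result    = ((1,0,0) - 0.2125) * 0 + 0.2125 = (0.2125, 0.2125, 0.2125), w = 1
+// A minimal usage sketch, assuming xnamath.h is included:
+//   XMVECTOR red  = XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f);
+//   XMVECTOR gray = XMColorAdjustSaturation(red, 0.0f); // ~(0.2125, 0.2125, 0.2125, 1)
+//   XMVECTOR same = XMColorAdjustSaturation(red, 1.0f); // unchanged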
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMColorAdjustContrast
+(
+ FXMVECTOR vColor,
+ FLOAT fContrast
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ // Result = (vColor - 0.5f) * fContrast + 0.5f;
+ XMVECTOR vResult = {
+ ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f,
+ ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f,
+ ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f,
+ vColor.vector4_f32[3] // Leave W untouched
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale
+ XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source)
+ vResult = _mm_mul_ps(vResult,vScale); // Mul by scale
+ vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f
+// Retain w from the source color
+ vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
+ vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
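+
+// Worked example (illustrative note, not part of the XNAMath source): the
+// result pivots around mid-gray, so fContrast = 0 maps every channel to 0.5
+// and fContrast = 1 leaves the color unchanged. For a channel value of 0.75
+// and fContrast = 2: (0.75 - 0.5) * 2 + 0.5 = 1.0.
+//   XMVECTOR c    = XMVectorSet(0.75f, 0.5f, 0.25f, 1.0f);
+//   XMVECTOR flat = XMColorAdjustContrast(c, 0.0f); // (0.5, 0.5, 0.5, 1)
+//   XMVECTOR hard = XMColorAdjustContrast(c, 2.0f); // (1.0, 0.5, 0.0, 1)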
+
+/****************************************************************************
+ *
+ * Miscellaneous
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMINLINE BOOL XMVerifyCPUSupport()
+{
+#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_SSE_INTRINSICS_)
+ return TRUE;
+#else // _XM_SSE_INTRINSICS_
+ // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail
+ // Detecting SSE2 on older versions of Windows would require using cpuid directly
+ return ( IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE ) && IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ) );
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+
+#define XMASSERT_LINE_STRING_SIZE 16
+
+XMINLINE VOID XMAssert
+(
+ CONST CHAR* pExpression,
+ CONST CHAR* pFileName,
+ UINT LineNumber
+)
+{
+ CHAR aLineString[XMASSERT_LINE_STRING_SIZE];
+ CHAR* pLineString;
+ UINT Line;
+
+ aLineString[XMASSERT_LINE_STRING_SIZE - 2] = '0';
+ aLineString[XMASSERT_LINE_STRING_SIZE - 1] = '\0';
+ for (Line = LineNumber, pLineString = aLineString + XMASSERT_LINE_STRING_SIZE - 2;
+ Line != 0 && pLineString >= aLineString;
+ Line /= 10, pLineString--)
+ {
+ *pLineString = (CHAR)('0' + (Line % 10));
+ }
+
+#ifndef NO_OUTPUT_DEBUG_STRING
+ OutputDebugStringA("Assertion failed: ");
+ OutputDebugStringA(pExpression);
+ OutputDebugStringA(", file ");
+ OutputDebugStringA(pFileName);
+ OutputDebugStringA(", line ");
+ OutputDebugStringA(pLineString + 1);
+ OutputDebugStringA("\r\n");
+#else
+ DbgPrint("Assertion failed: %s, file %s, line %d\r\n", pExpression, pFileName, LineNumber);
+#endif
+
+ __debugbreak();
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMFresnelTerm
+(
+ FXMVECTOR CosIncidentAngle,
+ FXMVECTOR RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR G;
+ XMVECTOR D, S;
+ XMVECTOR V0, V1, V2, V3;
+ XMVECTOR Result;
+
+ // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
+ // c = CosIncidentAngle
+ // g = sqrt(c^2 + RefractionIndex^2 - 1)
+
+ XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));
+
+ G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
+ G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
+ G = XMVectorAbs(G);
+ G = XMVectorSqrt(G);
+
+ S = XMVectorAdd(G, CosIncidentAngle);
+ D = XMVectorSubtract(G, CosIncidentAngle);
+
+ V0 = XMVectorMultiply(D, D);
+ V1 = XMVectorMultiply(S, S);
+ V1 = XMVectorReciprocal(V1);
+ V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
+ V0 = XMVectorMultiply(V0, V1);
+
+ V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
+ V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
+ V2 = XMVectorMultiply(V2, V2);
+ V3 = XMVectorMultiply(V3, V3);
+ V3 = XMVectorReciprocal(V3);
+ V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);
+
+ Result = XMVectorMultiply(V0, V2);
+
+ Result = XMVectorSaturate(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
+ // c = CosIncidentAngle
+ // g = sqrt(c^2 + RefractionIndex^2 - 1)
+
+ XMASSERT(!XMVector4IsInfinite(CosIncidentAngle));
+
+ // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
+ XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex);
+ XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle);
+ G = _mm_sub_ps(G,g_XMOne);
+ vTemp = _mm_add_ps(vTemp,G);
+ // max((0-vTemp),vTemp) == abs(vTemp)
+ // The abs is needed to deal with refraction and cosine being zero
+ G = _mm_setzero_ps();
+ G = _mm_sub_ps(G,vTemp);
+ G = _mm_max_ps(G,vTemp);
+ // Last operation, the sqrt()
+ G = _mm_sqrt_ps(G);
+
+ // Calc G-C and G+C
+ XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle);
+ XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle);
+ // Perform the term (0.5f *(g - c)^2) / (g + c)^2
+ XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC);
+ vTemp = _mm_mul_ps(GAddC,GAddC);
+ vResult = _mm_mul_ps(vResult,g_XMOneHalf);
+ vResult = _mm_div_ps(vResult,vTemp);
+ // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
+ GAddC = _mm_mul_ps(GAddC,CosIncidentAngle);
+ GSubC = _mm_mul_ps(GSubC,CosIncidentAngle);
+ GAddC = _mm_sub_ps(GAddC,g_XMOne);
+ GSubC = _mm_add_ps(GSubC,g_XMOne);
+ GAddC = _mm_mul_ps(GAddC,GAddC);
+ GSubC = _mm_mul_ps(GSubC,GSubC);
+ GAddC = _mm_div_ps(GAddC,GSubC);
+ GAddC = _mm_add_ps(GAddC,g_XMOne);
+ // Multiply the two term parts
+ vResult = _mm_mul_ps(vResult,GAddC);
+ // Clamp to 0.0 - 1.0f
+ vResult = _mm_max_ps(vResult,g_XMZero);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
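+
+// Sanity check (illustrative note, not part of the XNAMath source): at normal
+// incidence c = 1, so g = sqrt(1 + n^2 - 1) = n and the expression reduces to
+//   0.5 * (n-1)^2/(n+1)^2 * (n^2/n^2 + 1) = ((n-1)/(n+1))^2,
+// the familiar unpolarized reflectance at normal incidence. For glass with
+// n = 1.5 that is (0.5/2.5)^2 = 0.04, e.g.
+//   XMVECTOR f = XMFresnelTerm(XMVectorReplicate(1.0f), XMVectorReplicate(1.5f)); // ~0.04 in all lanes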
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMScalarNearEqual
+(
+ FLOAT S1,
+ FLOAT S2,
+ FLOAT Epsilon
+)
+{
+ FLOAT Delta = S1 - S2;
+#if defined(_XM_NO_INTRINSICS_)
+ UINT AbsDelta = *(const UINT*)&Delta & 0x7FFFFFFF;
+ return (*(FLOAT*)&AbsDelta <= Epsilon);
+#elif defined(_XM_SSE_INTRINSICS_)
+ return (fabsf(Delta) <= Epsilon);
+#else
+ return (__fabs(Delta) <= Epsilon);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
+XMFINLINE FLOAT XMScalarModAngle
+(
+ FLOAT Angle
+)
+{
+ // Note: The modulo is performed with unsigned math only to work
+ // around a precision error on numbers that are close to PI
+ float fTemp;
+#if defined(_XM_NO_INTRINSICS_) || !defined(_XM_VMX128_INTRINSICS_)
+ // Normalize the range from 0.0f to XM_2PI
+ Angle = Angle + XM_PI;
+ // Perform the modulo, unsigned
+ fTemp = fabsf(Angle);
+ fTemp = fTemp - (XM_2PI * (FLOAT)((INT)(fTemp/XM_2PI)));
+ // Restore the number to the range of -XM_PI to XM_PI-epsilon
+ fTemp = fTemp - XM_PI;
+ // If the modulo'd value was negative, restore negation
+ if (Angle<0.0f) {
+ fTemp = -fTemp;
+ }
+ return fTemp;
+#else
+#endif
+}
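+
+// Worked example (illustrative note, not part of the XNAMath source): for
+// Angle = 2.5*XM_PI the shifted value is 3.5*XM_PI, the unsigned modulo
+// leaves 1.5*XM_PI, and subtracting XM_PI yields 0.5*XM_PI -- the expected
+// result, since 2.5*pi and 0.5*pi differ by exactly one full turn. For
+// Angle = -2.5*XM_PI the same steps run on the absolute value and the final
+// negation restores -0.5*XM_PI.
+//   float a = XMScalarModAngle( 2.5f * XM_PI); // ~=  XM_PIDIV2
+//   float b = XMScalarModAngle(-2.5f * XM_PI); // ~= -XM_PIDIV2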
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarSin
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ValueMod;
+ FLOAT ValueSq;
+ XMVECTOR V0123, V0246, V1357, V9111315, V17192123;
+ XMVECTOR V1, V7, V8;
+ XMVECTOR R0, R1, R2;
+
+ ValueMod = XMScalarModAngle(Value);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
+ // V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+
+ ValueSq = ValueMod * ValueMod;
+
+ V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);
+ V1 = XMVectorSplatY(V0123);
+ V0246 = XMVectorMultiply(V0123, V0123);
+ V1357 = XMVectorMultiply(V0246, V1);
+ V7 = XMVectorSplatW(V1357);
+ V8 = XMVectorMultiply(V7, V1);
+ V9111315 = XMVectorMultiply(V1357, V8);
+ V17192123 = XMVectorMultiply(V9111315, V8);
+
+ R0 = XMVector4Dot(V1357, g_XMSinCoefficients0.v);
+ R1 = XMVector4Dot(V9111315, g_XMSinCoefficients1.v);
+ R2 = XMVector4Dot(V17192123, g_XMSinCoefficients2.v);
+
+ return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return sinf( Value );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
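+
+// Evaluation note (illustrative, not part of the XNAMath source): the three
+// dot products sum the truncated Taylor series four terms at a time,
+//   V1357     = (V,    V^3,  V^5,  V^7 )  dotted with (1,     -1/3!,   1/5!,   -1/7! )
+//   V9111315  = (V^9,  V^11, V^13, V^15)  dotted with (1/9!,  -1/11!,  1/13!,  -1/15!)
+//   V17192123 = (V^17, V^19, V^21, V^23)  dotted with (1/17!, -1/19!,  1/21!,  -1/23!)
+// so the scalar result is the degree-23 polynomial approximation of sin(V)
+// on [-XM_PI, XM_PI).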
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarCos
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ValueMod;
+ FLOAT ValueSq;
+ XMVECTOR V0123, V0246, V8101214, V16182022;
+ XMVECTOR V2, V6, V8;
+ XMVECTOR R0, R1, R2;
+
+ ValueMod = XMScalarModAngle(Value);
+
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
+ // V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+
+ ValueSq = ValueMod * ValueMod;
+
+ V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);
+ V0246 = XMVectorMultiply(V0123, V0123);
+
+ V2 = XMVectorSplatZ(V0123);
+ V6 = XMVectorSplatW(V0246);
+ V8 = XMVectorMultiply(V6, V2);
+
+ V8101214 = XMVectorMultiply(V0246, V8);
+ V16182022 = XMVectorMultiply(V8101214, V8);
+
+ R0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v);
+ R1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v);
+ R2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v);
+
+ return R0.vector4_f32[0] + R1.vector4_f32[0] + R2.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return cosf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE VOID XMScalarSinCos
+(
+ FLOAT* pSin,
+ FLOAT* pCos,
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ValueMod;
+ FLOAT ValueSq;
+ XMVECTOR V0123, V0246, V1357, V8101214, V9111315, V16182022, V17192123;
+ XMVECTOR V1, V2, V6, V8;
+ XMVECTOR S0, S1, S2, C0, C1, C2;
+
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+
+ ValueMod = XMScalarModAngle(Value);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - V^15 / 15! +
+ // V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! +
+ // V^12 / 12! - V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+
+ ValueSq = ValueMod * ValueMod;
+
+ V0123 = XMVectorSet(1.0f, ValueMod, ValueSq, ValueSq * ValueMod);
+
+ V1 = XMVectorSplatY(V0123);
+ V2 = XMVectorSplatZ(V0123);
+
+ V0246 = XMVectorMultiply(V0123, V0123);
+ V1357 = XMVectorMultiply(V0246, V1);
+
+ V6 = XMVectorSplatW(V0246);
+ V8 = XMVectorMultiply(V6, V2);
+
+ V8101214 = XMVectorMultiply(V0246, V8);
+ V9111315 = XMVectorMultiply(V1357, V8);
+ V16182022 = XMVectorMultiply(V8101214, V8);
+ V17192123 = XMVectorMultiply(V9111315, V8);
+
+ C0 = XMVector4Dot(V0246, g_XMCosCoefficients0.v);
+ S0 = XMVector4Dot(V1357, g_XMSinCoefficients0.v);
+ C1 = XMVector4Dot(V8101214, g_XMCosCoefficients1.v);
+ S1 = XMVector4Dot(V9111315, g_XMSinCoefficients1.v);
+ C2 = XMVector4Dot(V16182022, g_XMCosCoefficients2.v);
+ S2 = XMVector4Dot(V17192123, g_XMSinCoefficients2.v);
+
+ *pCos = C0.vector4_f32[0] + C1.vector4_f32[0] + C2.vector4_f32[0];
+ *pSin = S0.vector4_f32[0] + S1.vector4_f32[0] + S2.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+
+ *pSin = sinf(Value);
+ *pCos = cosf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarASin
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT AbsValue, Value2, Value3, D;
+ XMVECTOR AbsV, R0, R1, Result;
+ XMVECTOR V3;
+
+ *(UINT*)&AbsValue = *(const UINT*)&Value & 0x7FFFFFFF;
+
+ Value2 = Value * AbsValue;
+ Value3 = Value * Value2;
+ D = (Value - Value2) / sqrtf(1.00000011921f - AbsValue);
+
+ AbsV = XMVectorReplicate(AbsValue);
+
+ V3.vector4_f32[0] = Value3;
+ V3.vector4_f32[1] = 1.0f;
+ V3.vector4_f32[2] = Value3;
+ V3.vector4_f32[3] = 1.0f;
+
+ R1 = XMVectorSet(D, D, Value, Value);
+ R1 = XMVectorMultiply(R1, V3);
+
+ R0 = XMVectorMultiplyAdd(AbsV, g_XMASinCoefficients0.v, g_XMASinCoefficients1.v);
+ R0 = XMVectorMultiplyAdd(AbsV, R0, g_XMASinCoefficients2.v);
+
+ Result = XMVector4Dot(R0, R1);
+
+ return Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return asinf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE FLOAT XMScalarACos
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return XM_PIDIV2 - XMScalarASin(Value);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return acosf(Value);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarSinEst
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ValueSq;
+ XMVECTOR V;
+ XMVECTOR Y;
+ XMVECTOR Result;
+
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+
+ ValueSq = Value * Value;
+
+ V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value);
+ Y = XMVectorSplatY(V);
+ V = XMVectorMultiply(V, V);
+ V = XMVectorMultiply(V, Y);
+
+ Result = XMVector4Dot(V, g_XMSinEstCoefficients.v);
+
+ return Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+ float ValueSq = Value*Value;
+ XMVECTOR vValue = _mm_set_ps1(Value);
+ XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f);
+ vTemp = _mm_mul_ps(vTemp,vTemp);
+ vTemp = _mm_mul_ps(vTemp,vValue);
+ // vTemp = Value,Value^3,Value^5,Value^7
+ vTemp = _mm_mul_ps(vTemp,g_XMSinEstCoefficients);
+ vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
+ vValue = _mm_add_ps(vValue,vTemp); // Add Z = X+Z; W = Y+W;
+ vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
+ vTemp = _mm_add_ps(vTemp,vValue); // Add Z and W together
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ return _mm_cvtss_f32(vTemp);
+#else
+ return vTemp.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarCosEst
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT ValueSq;
+ XMVECTOR V;
+ XMVECTOR Result;
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
+ ValueSq = Value * Value;
+ V = XMVectorSet(1.0f, Value, ValueSq, ValueSq * Value);
+ V = XMVectorMultiply(V, V);
+ Result = XMVector4Dot(V, g_XMCosEstCoefficients.v);
+ return Result.vector4_f32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+ float ValueSq = Value*Value;
+ XMVECTOR vValue = _mm_setzero_ps();
+ XMVECTOR vTemp = _mm_set_ps(ValueSq * Value,ValueSq,Value,1.0f);
+ vTemp = _mm_mul_ps(vTemp,vTemp);
+ // vTemp = 1.0f,Value^2,Value^4,Value^6
+ vTemp = _mm_mul_ps(vTemp,g_XMCosEstCoefficients);
+ vValue = _mm_shuffle_ps(vValue,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
+ vValue = _mm_add_ps(vValue,vTemp); // Add Z = X+Z; W = Y+W;
+ vTemp = _mm_shuffle_ps(vTemp,vValue,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
+ vTemp = _mm_add_ps(vTemp,vValue); // Add Z and W together
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ return _mm_cvtss_f32(vTemp);
+#else
+ return vTemp.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMScalarSinCosEst
+(
+ FLOAT* pSin,
+ FLOAT* pCos,
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT ValueSq;
+ XMVECTOR V, Sin, Cos;
+ XMVECTOR Y;
+
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
+
+ ValueSq = Value * Value;
+ V = XMVectorSet(1.0f, Value, ValueSq, Value * ValueSq);
+ Y = XMVectorSplatY(V);
+ Cos = XMVectorMultiply(V, V);
+ Sin = XMVectorMultiply(Cos, Y);
+
+ Cos = XMVector4Dot(Cos, g_XMCosEstCoefficients.v);
+ Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients.v);
+
+ *pCos = Cos.vector4_f32[0];
+ *pSin = Sin.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+ XMASSERT(Value >= -XM_PI);
+ XMASSERT(Value < XM_PI);
+ float ValueSq = Value * Value;
+ XMVECTOR Cos = _mm_set_ps(Value * ValueSq,ValueSq,Value,1.0f);
+ XMVECTOR Sin = _mm_set_ps1(Value);
+ Cos = _mm_mul_ps(Cos,Cos);
+ Sin = _mm_mul_ps(Sin,Cos);
+ // Cos = 1.0f,Value^2,Value^4,Value^6
+ Cos = XMVector4Dot(Cos,g_XMCosEstCoefficients);
+ _mm_store_ss(pCos,Cos);
+ // Sin = Value,Value^3,Value^5,Value^7
+ Sin = XMVector4Dot(Sin, g_XMSinEstCoefficients);
+ _mm_store_ss(pSin,Sin);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
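+
+// Usage note (illustrative, not part of the XNAMath source): unlike
+// XMScalarSinCos, the *Est variants assert that Value is already in
+// [-XM_PI, XM_PI), so out-of-range angles are typically wrapped first:
+//   float s, c;
+//   XMScalarSinCosEst(&s, &c, XMScalarModAngle(angle)); // angle: any float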
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarASinEst
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR VR, CR, CS;
+ XMVECTOR Result;
+ FLOAT AbsV, V2, D;
+ CONST FLOAT OnePlusEps = 1.00000011921f;
+
+ *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF;
+ V2 = Value * AbsV;
+ D = OnePlusEps - AbsV;
+
+ CS = XMVectorSet(Value, 1.0f, 1.0f, V2);
+ VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV);
+ CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v);
+
+ Result = XMVector4Dot(VR, CR);
+
+ return Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ CONST FLOAT OnePlusEps = 1.00000011921f;
+ FLOAT AbsV = fabsf(Value);
+ FLOAT V2 = Value * AbsV; // Square with sign retained
+ FLOAT D = OnePlusEps - AbsV;
+
+ XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value);
+ XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D));
+ Result = _mm_mul_ps(Result, g_XMASinEstCoefficients);
+ Result = XMVector4Dot(VR,Result);
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ return _mm_cvtss_f32(Result);
+#else
+ return Result.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE FLOAT XMScalarACosEst
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR VR, CR, CS;
+ XMVECTOR Result;
+ FLOAT AbsV, V2, D;
+ CONST FLOAT OnePlusEps = 1.00000011921f;
+
+ // return XM_PIDIV2 - XMScalarASin(Value);
+
+ *(UINT*)&AbsV = *(const UINT*)&Value & 0x7FFFFFFF;
+ V2 = Value * AbsV;
+ D = OnePlusEps - AbsV;
+
+ CS = XMVectorSet(Value, 1.0f, 1.0f, V2);
+ VR = XMVectorSet(sqrtf(D), Value, V2, D * AbsV);
+ CR = XMVectorMultiply(CS, g_XMASinEstCoefficients.v);
+
+ Result = XMVector4Dot(VR, CR);
+
+ return XM_PIDIV2 - Result.vector4_f32[0];
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ CONST FLOAT OnePlusEps = 1.00000011921f;
+ FLOAT AbsV = fabsf(Value);
+ FLOAT V2 = Value * AbsV; // Value^2 retaining sign
+ FLOAT D = OnePlusEps - AbsV;
+ XMVECTOR Result = _mm_set_ps(V2,1.0f,1.0f,Value);
+ XMVECTOR VR = _mm_set_ps(D * AbsV,V2,Value,sqrtf(D));
+ Result = _mm_mul_ps(Result,g_XMASinEstCoefficients);
+ Result = XMVector4Dot(VR,Result);
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ return XM_PIDIV2 - _mm_cvtss_f32(Result);
+#else
+ return XM_PIDIV2 - Result.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#endif // __XNAMATHMISC_INL__
+
diff --git a/thirdparty/directxtex/XNAMath/xnamathvector.inl b/thirdparty/directxtex/XNAMath/xnamathvector.inl
new file mode 100644
index 00000000..37b7d132
--- /dev/null
+++ b/thirdparty/directxtex/XNAMath/xnamathvector.inl
@@ -0,0 +1,13673 @@
+/************************************************************************
+* *
+* xnamathvector.inl -- SIMD C++ Math library for Windows and Xbox 360 *
+* Vector functions *
+* *
+* Copyright (c) Microsoft Corp. All rights reserved. *
+* *
+************************************************************************/
+
+#if defined(_MSC_VER) && (_MSC_VER > 1000)
+#pragma once
+#endif
+
+#ifndef __XNAMATHVECTOR_INL__
+#define __XNAMATHVECTOR_INL__
+
+#if defined(_XM_NO_INTRINSICS_)
+#define XMISNAN(x) ((*(UINT*)&(x) & 0x7F800000) == 0x7F800000 && (*(UINT*)&(x) & 0x7FFFFF) != 0)
+#define XMISINF(x) ((*(UINT*)&(x) & 0x7FFFFFFF) == 0x7F800000)
+#endif
+
+/****************************************************************************
+ *
+ * General Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Assignment operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Return a vector with all elements equaling zero
+XMFINLINE XMVECTOR XMVectorZero()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_setzero_ps();
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four floating point values
+XMFINLINE XMVECTOR XMVectorSet
+(
+ FLOAT x,
+ FLOAT y,
+ FLOAT z,
+ FLOAT w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 vResult = {x,y,z,w};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_set_ps( w, z, y, x );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four integer values
+XMFINLINE XMVECTOR XMVectorSetInt
+(
+ UINT x,
+ UINT y,
+ UINT z,
+ UINT w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORU32 vResult = {x,y,z,w};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_set_epi32( w, z, y, x );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value
+XMFINLINE XMVECTOR XMVectorReplicate
+(
+ FLOAT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+ XMVECTORF32 vResult = {Value,Value,Value,Value};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_set_ps1( Value );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value passed by pointer
+XMFINLINE XMVECTOR XMVectorReplicatePtr
+(
+ CONST FLOAT *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+ FLOAT Value = pValue[0];
+ XMVECTORF32 vResult = {Value,Value,Value,Value};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_load_ps1( pValue );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value
+XMFINLINE XMVECTOR XMVectorReplicateInt
+(
+ UINT Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+ XMVECTORU32 vResult = {Value,Value,Value,Value};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_set1_epi32( Value );
+ return reinterpret_cast<const __m128 *>(&vTemp)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value passed by pointer
+XMFINLINE XMVECTOR XMVectorReplicateIntPtr
+(
+ CONST UINT *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+ UINT Value = pValue[0];
+ XMVECTORU32 vResult = {Value,Value,Value,Value};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_load_ps1(reinterpret_cast<const float *>(pValue));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits set (true mask)
+XMFINLINE XMVECTOR XMVectorTrueInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
+ return vResult.v;
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_set1_epi32(-1);
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits clear (false mask)
+XMFINLINE XMVECTOR XMVectorFalseInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_setzero_ps();
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Replicate the x component of the vector
+XMFINLINE XMVECTOR XMVectorSplatX
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_f32[0] =
+ vResult.vector4_f32[1] =
+ vResult.vector4_f32[2] =
+ vResult.vector4_f32[3] = V.vector4_f32[0];
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_shuffle_ps( V, V, _MM_SHUFFLE(0, 0, 0, 0) );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Replicate the y component of the vector
+XMFINLINE XMVECTOR XMVectorSplatY
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_f32[0] =
+ vResult.vector4_f32[1] =
+ vResult.vector4_f32[2] =
+ vResult.vector4_f32[3] = V.vector4_f32[1];
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_shuffle_ps( V, V, _MM_SHUFFLE(1, 1, 1, 1) );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Replicate the z component of the vector
+XMFINLINE XMVECTOR XMVectorSplatZ
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_f32[0] =
+ vResult.vector4_f32[1] =
+ vResult.vector4_f32[2] =
+ vResult.vector4_f32[3] = V.vector4_f32[2];
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_shuffle_ps( V, V, _MM_SHUFFLE(2, 2, 2, 2) );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Replicate the w component of the vector
+XMFINLINE XMVECTOR XMVectorSplatW
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_f32[0] =
+ vResult.vector4_f32[1] =
+ vResult.vector4_f32[2] =
+ vResult.vector4_f32[3] = V.vector4_f32[3];
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_shuffle_ps( V, V, _MM_SHUFFLE(3, 3, 3, 3) );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of 1.0f,1.0f,1.0f,1.0f
+XMFINLINE XMVECTOR XMVectorSplatOne()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_f32[0] =
+ vResult.vector4_f32[1] =
+ vResult.vector4_f32[2] =
+ vResult.vector4_f32[3] = 1.0f;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return g_XMOne;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of INF,INF,INF,INF
+XMFINLINE XMVECTOR XMVectorSplatInfinity()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_u32[0] =
+ vResult.vector4_u32[1] =
+ vResult.vector4_u32[2] =
+ vResult.vector4_u32[3] = 0x7F800000;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return g_XMInfinity;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
+XMFINLINE XMVECTOR XMVectorSplatQNaN()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_u32[0] =
+ vResult.vector4_u32[1] =
+ vResult.vector4_u32[2] =
+ vResult.vector4_u32[3] = 0x7FC00000;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return g_XMQNaN;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
+XMFINLINE XMVECTOR XMVectorSplatEpsilon()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_u32[0] =
+ vResult.vector4_u32[1] =
+ vResult.vector4_u32[2] =
+ vResult.vector4_u32[3] = 0x34000000;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return g_XMEpsilon;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
+XMFINLINE XMVECTOR XMVectorSplatSignMask()
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult;
+ vResult.vector4_u32[0] =
+ vResult.vector4_u32[1] =
+ vResult.vector4_u32[2] =
+ vResult.vector4_u32[3] = 0x80000000U;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_set1_epi32( 0x80000000 );
+ return reinterpret_cast<__m128*>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return a floating point value via an index. This is not a recommended
+// function to use due to performance loss.
+XMFINLINE FLOAT XMVectorGetByIndex(FXMVECTOR V,UINT i)
+{
+ XMASSERT( i <= 3 );
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_f32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+ return V.m128_f32[i];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return the X component in an FPU register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE FLOAT XMVectorGetX(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_f32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ return _mm_cvtss_f32(V);
+#else
+ return V.m128_f32[0];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Y component in an FPU register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE FLOAT XMVectorGetY(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_f32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ return _mm_cvtss_f32(vTemp);
+#else
+ return V.m128_f32[1];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Z component in an FPU register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE FLOAT XMVectorGetZ(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_f32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ return _mm_cvtss_f32(vTemp);
+#else
+ return V.m128_f32[2];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the W component in an FPU register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE FLOAT XMVectorGetW(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_f32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3));
+ return _mm_cvtss_f32(vTemp);
+#else
+ return V.m128_f32[3];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit float location in memory.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE VOID XMVectorGetByIndexPtr(FLOAT *f,FXMVECTOR V,UINT i)
+{
+ XMASSERT( f != 0 );
+ XMASSERT( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+ *f = V.vector4_f32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+ *f = V.m128_f32[i];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit float location in memory.
+XMFINLINE VOID XMVectorGetXPtr(FLOAT *x,FXMVECTOR V)
+{
+ XMASSERT( x != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *x = V.vector4_f32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+ _mm_store_ss(x,V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Y component into a 32 bit float location in memory.
+XMFINLINE VOID XMVectorGetYPtr(FLOAT *y,FXMVECTOR V)
+{
+ XMASSERT( y != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *y = V.vector4_f32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ _mm_store_ss(y,vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Z component into a 32 bit float location in memory.
+XMFINLINE VOID XMVectorGetZPtr(FLOAT *z,FXMVECTOR V)
+{
+ XMASSERT( z != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *z = V.vector4_f32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(z,vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the W component into a 32 bit float location in memory.
+XMFINLINE VOID XMVectorGetWPtr(FLOAT *w,FXMVECTOR V)
+{
+ XMASSERT( w != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *w = V.vector4_f32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3));
+ _mm_store_ss(w,vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+XMFINLINE UINT XMVectorGetIntByIndex(FXMVECTOR V, UINT i)
+{
+ XMASSERT( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER<1400)
+ XMVECTORU32 tmp;
+ tmp.v = V;
+ return tmp.u[i];
+#else
+ return V.m128_u32[i];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntX(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_u32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+ return static_cast<UINT>(_mm_cvtsi128_si32(reinterpret_cast<const __m128i *>(&V)[0]));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Y component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntY(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_u32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(1,1,1,1));
+ return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the Z component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntZ(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_u32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(2,2,2,2));
+ return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Return the W component in an integer register.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE UINT XMVectorGetIntW(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return V.vector4_u32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vResulti = _mm_shuffle_epi32(reinterpret_cast<const __m128i *>(&V)[0],_MM_SHUFFLE(3,3,3,3));
+ return static_cast<UINT>(_mm_cvtsi128_si32(vResulti));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE VOID XMVectorGetIntByIndexPtr(UINT *x,FXMVECTOR V,UINT i)
+{
+ XMASSERT( x != 0 );
+ XMASSERT( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+ *x = V.vector4_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_MSC_VER) && (_MSC_VER<1400)
+ XMVECTORU32 tmp;
+ tmp.v = V;
+ *x = tmp.u[i];
+#else
+ *x = V.m128_u32[i];
+#endif
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntXPtr(UINT *x,FXMVECTOR V)
+{
+ XMASSERT( x != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *x = V.vector4_u32[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+ _mm_store_ss(reinterpret_cast<float *>(x),V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntYPtr(UINT *y,FXMVECTOR V)
+{
+ XMASSERT( y != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *y = V.vector4_u32[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ _mm_store_ss(reinterpret_cast<float *>(y),vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the Z component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntZPtr(UINT *z,FXMVECTOR V)
+{
+ XMASSERT( z != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *z = V.vector4_u32[2];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(reinterpret_cast<float *>(z),vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Store the W component into a 32 bit integer location in memory.
+XMFINLINE VOID XMVectorGetIntWPtr(UINT *w,FXMVECTOR V)
+{
+ XMASSERT( w != 0 );
+#if defined(_XM_NO_INTRINSICS_)
+ *w = V.vector4_u32[3];
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3));
+ _mm_store_ss(reinterpret_cast<float *>(w),vResult);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Set a single indexed floating point component
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetByIndex(FXMVECTOR V, FLOAT f,UINT i)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( i <= 3 );
+ U = V;
+ U.vector4_f32[i] = f;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( i <= 3 );
+ XMVECTOR U = V;
+ U.m128_f32[i] = f;
+ return U;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a passed floating point value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetX(FXMVECTOR V, FLOAT x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_f32[0] = x;
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_f32[0] = x;
+ return vResult;
+#else
+ XMVECTOR vResult = _mm_set_ss(x);
+ vResult = _mm_move_ss(V,vResult);
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Y component of a vector to a passed floating point value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetY(FXMVECTOR V, FLOAT y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = y;
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_f32[1] = y;
+ return vResult;
+#else
+ // Swap y and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_set_ss(y);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap y and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
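+
+// Lane trace (illustrative note, not part of the XNAMath source) for the SSE
+// path above, with V = (x, y, z, w) and a new value n:
+//   _mm_shuffle_ps(V, V, _MM_SHUFFLE(3,2,0,1))          -> (y, x, z, w)  // y moved to lane 0
+//   _mm_move_ss(..., _mm_set_ss(n))                     -> (n, x, z, w)  // lane 0 replaced
+//   _mm_shuffle_ps(..., ..., _MM_SHUFFLE(3,2,0,1))      -> (x, n, z, w)  // swap back
+// The same swap-in / move_ss / swap-out pattern is reused by SetZ and SetW
+// with the matching shuffle masks.
+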
+// Sets the Z component of a vector to a passed floating point value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetZ(FXMVECTOR V, FLOAT z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = z;
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_f32[2] = z;
+ return vResult;
+#else
+ // Swap z and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_set_ss(z);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap z and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the W component of a vector to a passed floating point value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetW(FXMVECTOR V, FLOAT w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = w;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_f32[3] = w;
+ return vResult;
+#else
+ // Swap w and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_set_ss(w);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap w and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to a floating point value passed by pointer
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V,CONST FLOAT *f,UINT i)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( f != 0 );
+ XMASSERT( i <= 3 );
+ U = V;
+ U.vector4_f32[i] = *f;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( f != 0 );
+ XMASSERT( i <= 3 );
+ XMVECTOR U = V;
+ U.m128_f32[i] = *f;
+ return U;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a floating point value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetXPtr(FXMVECTOR V,CONST FLOAT *x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( x != 0 );
+ U.vector4_f32[0] = *x;
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( x != 0 );
+ XMVECTOR vResult = _mm_load_ss(x);
+ vResult = _mm_move_ss(V,vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Y component of a vector to a floating point value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetYPtr(FXMVECTOR V,CONST FLOAT *y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( y != 0 );
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = *y;
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( y != 0 );
+ // Swap y and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(y);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap y and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Z component of a vector to a floating point value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetZPtr(FXMVECTOR V,CONST FLOAT *z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( z != 0 );
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = *z;
+ U.vector4_f32[3] = V.vector4_f32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( z != 0 );
+ // Swap z and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(z);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap z and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the W component of a vector to a floating point value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetWPtr(FXMVECTOR V,CONST FLOAT *w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( w != 0 );
+ U.vector4_f32[0] = V.vector4_f32[0];
+ U.vector4_f32[1] = V.vector4_f32[1];
+ U.vector4_f32[2] = V.vector4_f32[2];
+ U.vector4_f32[3] = *w;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( w != 0 );
+ // Swap w and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(w);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap w and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer passed by value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, UINT x, UINT i)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( i <= 3 );
+ U = V;
+ U.vector4_u32[i] = x;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( i <= 3 );
+ XMVECTORU32 tmp;
+ tmp.v = V;
+ tmp.u[i] = x;
+ return tmp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer passed by value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntX(FXMVECTOR V, UINT x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_u32[0] = x;
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_i32[0] = x;
+ return vResult;
+#else
+ __m128i vTemp = _mm_cvtsi32_si128(x);
+ XMVECTOR vResult = _mm_move_ss(V,reinterpret_cast<const __m128 *>(&vTemp)[0]);
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Y component of a vector to an integer passed by value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntY(FXMVECTOR V, UINT y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = y;
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_i32[1] = y;
+ return vResult;
+#else // Swap y and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1));
+ // Convert input to vector
+ __m128i vTemp = _mm_cvtsi32_si128(y);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,reinterpret_cast<const __m128 *>(&vTemp)[0]);
+ // Swap y and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Z component of a vector to an integer passed by value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntZ(FXMVECTOR V, UINT z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = z;
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_i32[2] = z;
+ return vResult;
+#else
+ // Swap z and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2));
+ // Convert input to vector
+ __m128i vTemp = _mm_cvtsi32_si128(z);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,reinterpret_cast<const __m128 *>(&vTemp)[0]);
+ // Swap z and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the W component of a vector to an integer passed by value
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntW(FXMVECTOR V, UINT w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = w;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_XM_ISVS2005_)
+ XMVECTOR vResult = V;
+ vResult.m128_i32[3] = w;
+ return vResult;
+#else
+ // Swap w and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3));
+ // Convert input to vector
+ __m128i vTemp = _mm_cvtsi32_si128(w);
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,reinterpret_cast<const __m128 *>(&vTemp)[0]);
+ // Swap w and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3));
+ return vResult;
+#endif // _XM_ISVS2005_
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+// This causes Load/Hit/Store on VMX targets
+XMFINLINE XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, CONST UINT *x,UINT i)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( x != 0 );
+ XMASSERT( i <= 3 );
+ U = V;
+ U.vector4_u32[i] = *x;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( x != 0 );
+ XMASSERT( i <= 3 );
+ XMVECTORU32 tmp;
+ tmp.v = V;
+ tmp.u[i] = *x;
+ return tmp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V,CONST UINT *x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( x != 0 );
+ U.vector4_u32[0] = *x;
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( x != 0 );
+ XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x));
+ XMVECTOR vResult = _mm_move_ss(V,vTemp);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Y component of a vector to an integer value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V,CONST UINT *y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( y != 0 );
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = *y;
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( y != 0 );
+ // Swap y and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y));
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap y and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,2,0,1));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V,CONST UINT *z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( z != 0 );
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = *z;
+ U.vector4_u32[3] = V.vector4_u32[3];
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( z != 0 );
+ // Swap z and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,0,1,2));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z));
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap z and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+XMFINLINE XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V,CONST UINT *w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR U;
+ XMASSERT( w != 0 );
+ U.vector4_u32[0] = V.vector4_u32[0];
+ U.vector4_u32[1] = V.vector4_u32[1];
+ U.vector4_u32[2] = V.vector4_u32[2];
+ U.vector4_u32[3] = *w;
+ return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( w != 0 );
+ // Swap w and x
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,2,1,3));
+ // Convert input to vector
+ XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w));
+ // Replace the x component
+ vResult = _mm_move_ss(vResult,vTemp);
+ // Swap w and x again
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,2,1,3));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorPermute
+// operations. Visualize the two vectors V1 and V2 given
+// in a permute as arranged back to back in a linear fashion,
+// such that they form an array of 8 floating point values.
+// The four integers specified in XMVectorPermuteControl
+// will serve as indices into the array to select components
+// from the two vectors. ElementIndex0 is used to select
+// an element from the vectors to be placed in the first
+// component of the resulting vector, ElementIndex1 is used
+// to select an element for the second component, etc.
+
+XMFINLINE XMVECTOR XMVectorPermuteControl
+(
+ UINT ElementIndex0,
+ UINT ElementIndex1,
+ UINT ElementIndex2,
+ UINT ElementIndex3
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) || defined(_XM_NO_INTRINSICS_)
+ XMVECTORU32 vControl;
+ static CONST UINT ControlElement[] = {
+ XM_PERMUTE_0X,
+ XM_PERMUTE_0Y,
+ XM_PERMUTE_0Z,
+ XM_PERMUTE_0W,
+ XM_PERMUTE_1X,
+ XM_PERMUTE_1Y,
+ XM_PERMUTE_1Z,
+ XM_PERMUTE_1W
+ };
+ XMASSERT(ElementIndex0 < 8);
+ XMASSERT(ElementIndex1 < 8);
+ XMASSERT(ElementIndex2 < 8);
+ XMASSERT(ElementIndex3 < 8);
+
+ vControl.u[0] = ControlElement[ElementIndex0];
+ vControl.u[1] = ControlElement[ElementIndex1];
+ vControl.u[2] = ControlElement[ElementIndex2];
+ vControl.u[3] = ControlElement[ElementIndex3];
+ return vControl.v;
+#else
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Using a control vector made up of 16 bytes from 0-31, remap V1 and V2's byte
+// entries into a single 16 byte vector and return it. Index 0-15 = V1,
+// 16-31 = V2
+XMFINLINE XMVECTOR XMVectorPermute
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Control
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ const BYTE *aByte[2];
+ XMVECTOR Result;
+ UINT i, uIndex, VectorIndex;
+ const BYTE *pControl;
+ BYTE *pWork;
+
+ // Indices must be in range from 0 to 31
+ XMASSERT((Control.vector4_u32[0] & 0xE0E0E0E0) == 0);
+ XMASSERT((Control.vector4_u32[1] & 0xE0E0E0E0) == 0);
+ XMASSERT((Control.vector4_u32[2] & 0xE0E0E0E0) == 0);
+ XMASSERT((Control.vector4_u32[3] & 0xE0E0E0E0) == 0);
+
+ // 0-15 = V1, 16-31 = V2
+ aByte[0] = (const BYTE*)(&V1);
+ aByte[1] = (const BYTE*)(&V2);
+ i = 16;
+ pControl = (const BYTE *)(&Control);
+ pWork = (BYTE *)(&Result);
+ do {
+ // Get the byte to map from
+ uIndex = pControl[0];
+ ++pControl;
+ VectorIndex = (uIndex>>4)&1;
+ uIndex &= 0x0F;
+#if defined(_XM_LITTLEENDIAN_)
+ uIndex ^= 3; // Swap byte ordering on little endian machines
+#endif
+ pWork[0] = aByte[VectorIndex][uIndex];
+ ++pWork;
+ } while (--i);
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_)
+#if defined(_PREFAST_) || defined(XMDEBUG)
+ // Indices must be in range from 0 to 31
+    static const XMVECTORI32 PermuteTest = {0xE0E0E0E0,0xE0E0E0E0,0xE0E0E0E0,0xE0E0E0E0};
+    XMVECTOR vAssert = _mm_and_ps(Control,PermuteTest);
+ __m128i vAsserti = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&vAssert)[0],g_XMZero);
+ XMASSERT(_mm_movemask_ps(*reinterpret_cast<const __m128 *>(&vAsserti)) == 0xf);
+#endif
+ // Store the vectors onto local memory on the stack
+ XMVECTOR Array[2];
+ Array[0] = V1;
+ Array[1] = V2;
+ // Output vector, on the stack
+ XMVECTORU8 vResult;
+ // Get pointer to the two vectors on the stack
+ const BYTE *pInput = reinterpret_cast<const BYTE *>(Array);
+ // Store the Control vector on the stack to access the bytes
+ // don't use Control, it can cause a register variable to spill on the stack.
+ XMVECTORU8 vControl;
+ vControl.v = Control; // Write to memory
+ UINT i = 0;
+ do {
+ UINT ComponentIndex = vControl.u[i] & 0x1FU;
+ ComponentIndex ^= 3; // Swap byte ordering
+ vResult.u[i] = pInput[ComponentIndex];
+ } while (++i<16);
+ return vResult;
+#else // _XM_SSE_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
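+
+// Usage sketch added in this tree (not part of the upstream XNAMath sources): how
+// the two permute helpers above combine, assuming the standard XMVectorSet
+// constructor from xnamath.h. Indices 0-3 select from V1, 4-7 from V2.
+//
+//     XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
+//     XMVECTOR b = XMVectorSet(5.0f, 6.0f, 7.0f, 8.0f);
+//     XMVECTOR c = XMVectorPermuteControl(0, 5, 2, 7);
+//     XMVECTOR r = XMVectorPermute(a, b, c);   // r = (1.0f, 6.0f, 3.0f, 8.0f)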
+
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorSelect
+// operations. The four integers specified in XMVectorSelectControl
+// serve as indices to select between components in two vectors.
+// The first index controls selection for the first component of
+// the vectors involved in a select operation, the second index
+// controls selection for the second component etc. A value of
+// zero for an index causes the corresponding component from the first
+// vector to be selected whereas a one causes the component from the
+// second vector to be selected instead.
+
+XMFINLINE XMVECTOR XMVectorSelectControl
+(
+ UINT VectorIndex0,
+ UINT VectorIndex1,
+ UINT VectorIndex2,
+ UINT VectorIndex3
+)
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ // x=Index0,y=Index1,z=Index2,w=Index3
+ __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0);
+ // Any non-zero entries become 0xFFFFFFFF else 0
+ vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero);
+ return reinterpret_cast<__m128 *>(&vTemp)[0];
+#else
+ XMVECTOR ControlVector;
+ CONST UINT ControlElement[] =
+ {
+ XM_SELECT_0,
+ XM_SELECT_1
+ };
+
+ XMASSERT(VectorIndex0 < 2);
+ XMASSERT(VectorIndex1 < 2);
+ XMASSERT(VectorIndex2 < 2);
+ XMASSERT(VectorIndex3 < 2);
+
+ ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
+ ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
+ ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
+ ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
+
+ return ControlVector;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSelect
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Control
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]);
+ Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]);
+ Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]);
+ Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1);
+ XMVECTOR vTemp2 = _mm_and_ps(V2,Control);
+ return _mm_or_ps(vTemp1,vTemp2);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
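+
+// Usage sketch added in this tree (not part of upstream XNAMath): each argument to
+// XMVectorSelectControl is 0 (keep the V1 component) or 1 (take the V2 component),
+// and XMVectorSelect blends bitwise with the resulting mask. XMVectorSet is the
+// standard constructor from xnamath.h.
+//
+//     XMVECTOR lo   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
+//     XMVECTOR hi   = XMVectorSet(9.0f, 9.0f, 9.0f, 9.0f);
+//     XMVECTOR ctrl = XMVectorSelectControl(0, 1, 0, 1);
+//     XMVECTOR r    = XMVectorSelect(lo, hi, ctrl);   // r = (1.0f, 9.0f, 3.0f, 9.0f)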
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMergeXY
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[0];
+ Result.vector4_u32[1] = V2.vector4_u32[0];
+ Result.vector4_u32[2] = V1.vector4_u32[1];
+ Result.vector4_u32[3] = V2.vector4_u32[1];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_unpacklo_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMergeZW
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[2];
+ Result.vector4_u32[1] = V2.vector4_u32[2];
+ Result.vector4_u32[2] = V1.vector4_u32[3];
+ Result.vector4_u32[3] = V2.vector4_u32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_unpackhi_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+
+ Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmpeq_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorEqualR
+(
+ UINT* pCR,
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ux, uy, uz, uw, CR;
+ XMVECTOR Control;
+
+ XMASSERT( pCR );
+
+ ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+ CR = 0;
+ if (ux&uy&uz&uw)
+ {
+        // All elements are equal
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!(ux|uy|uz|uw))
+ {
+        // No elements are equal
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ Control.vector4_u32[0] = ux;
+ Control.vector4_u32[1] = uy;
+ Control.vector4_u32[2] = uz;
+ Control.vector4_u32[3] = uw;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( pCR );
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ UINT CR = 0;
+ int iTest = _mm_movemask_ps(vTemp);
+ if (iTest==0xf)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+        // No elements are equal
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ return vTemp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Treat the components of the vectors as unsigned integers and
+// compare individual bits between the two. This is useful for
+// comparing control vectors and result vectors returned from
+// other comparison operations.
+
+XMFINLINE XMVECTOR XMVectorEqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+
+ Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0;
+
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_cmpeq_epi32( reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0] );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
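+
+// Note added in this tree (not part of upstream XNAMath): because the result is
+// itself a control vector, an all-lanes test is usually written by comparing it to
+// XMVectorTrueInt() via XMVector4EqualInt (both from xnamath.h). Here v and limit
+// stand for any two vectors; this is only an illustrative sketch.
+//
+//     XMVECTOR maskA = XMVectorGreater(v, XMVectorZero());
+//     XMVECTOR maskB = XMVectorLess(v, limit);
+//     if (XMVector4EqualInt(XMVectorEqualInt(maskA, maskB), XMVectorTrueInt())) {
+//         // every lane agreed on both tests
+//     }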
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorEqualIntR
+(
+ UINT* pCR,
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+
+ XMASSERT(pCR);
+
+ Control = XMVectorEqualInt(V1, V2);
+
+ *pCR = 0;
+
+ if (XMVector4EqualInt(Control, XMVectorTrueInt()))
+ {
+ // All elements are equal
+ *pCR |= XM_CRMASK_CR6TRUE;
+ }
+ else if (XMVector4EqualInt(Control, XMVectorFalseInt()))
+ {
+ // All elements are not equal
+ *pCR |= XM_CRMASK_CR6FALSE;
+ }
+
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pCR);
+ __m128i V = _mm_cmpeq_epi32( reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0] );
+ int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128*>(&V)[0]);
+ UINT CR = 0;
+ if (iTemp==0x0F)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTemp)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
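+
+// Note added in this tree (not part of upstream XNAMath): the CR value written
+// through pCR is intended for the XMComparisonAllTrue / XMComparisonAnyFalse style
+// macros from xnamath.h; v1 and v2 below are placeholders for any two inputs.
+//
+//     UINT cr;
+//     XMVECTOR mask = XMVectorEqualIntR(&cr, v1, v2);
+//     if (XMComparisonAllTrue(cr)) { /* every component matched bit for bit */ }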
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNearEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT fDeltax, fDeltay, fDeltaz, fDeltaw;
+ XMVECTOR Control;
+
+ fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0];
+ fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1];
+ fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2];
+ fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3];
+
+ fDeltax = fabsf(fDeltax);
+ fDeltay = fabsf(fDeltay);
+ fDeltaz = fabsf(fDeltaz);
+ fDeltaw = fabsf(fDeltaw);
+
+ Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get the difference
+ XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+ // Get the absolute value of the difference
+ XMVECTOR vTemp = _mm_setzero_ps();
+ vTemp = _mm_sub_ps(vTemp,vDelta);
+ vTemp = _mm_max_ps(vTemp,vDelta);
+ vTemp = _mm_cmple_ps(vTemp,Epsilon);
+ return vTemp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
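+
+// Usage sketch added in this tree (not part of upstream XNAMath): the epsilon is
+// per-component, so a replicated scalar is the common case. XMVectorReplicate and
+// XMVector4EqualInt come from xnamath.h; v1 and v2 are placeholders.
+//
+//     XMVECTOR eps  = XMVectorReplicate(1.0e-4f);
+//     XMVECTOR mask = XMVectorNearEqual(v1, v2, eps);
+//     BOOL allClose = XMVector4EqualInt(mask, XMVectorTrueInt());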
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNotEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmpneq_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNotEqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_cmpeq_epi32( reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0] );
+ return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorGreater
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmpgt_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorGreaterR
+(
+ UINT* pCR,
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ux, uy, uz, uw, CR;
+ XMVECTOR Control;
+
+ XMASSERT( pCR );
+
+ ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+ CR = 0;
+ if (ux&uy&uz&uw)
+ {
+ // All elements are greater
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!(ux|uy|uz|uw))
+ {
+ // All elements are not greater
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ Control.vector4_u32[0] = ux;
+ Control.vector4_u32[1] = uy;
+ Control.vector4_u32[2] = uz;
+ Control.vector4_u32[3] = uw;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( pCR );
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ UINT CR = 0;
+ int iTest = _mm_movemask_ps(vTemp);
+ if (iTest==0xf)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ // All elements are not greater
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ return vTemp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorGreaterOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmpge_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorGreaterOrEqualR
+(
+ UINT* pCR,
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ux, uy, uz, uw, CR;
+ XMVECTOR Control;
+
+ XMASSERT( pCR );
+
+ ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+ CR = 0;
+ if (ux&uy&uz&uw)
+ {
+        // All elements are greater or equal
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!(ux|uy|uz|uw))
+ {
+        // No elements are greater or equal
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ Control.vector4_u32[0] = ux;
+ Control.vector4_u32[1] = uy;
+ Control.vector4_u32[2] = uz;
+ Control.vector4_u32[3] = uw;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( pCR );
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ UINT CR = 0;
+ int iTest = _mm_movemask_ps(vTemp);
+ if (iTest==0xf)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+        // No elements are greater or equal
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ *pCR = CR;
+ return vTemp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorLess
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmplt_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorLessOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_cmple_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorInBounds
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0;
+ Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ return vTemp1;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
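+
+// Note added in this tree (not part of upstream XNAMath): the bounds test is
+// symmetric, i.e. each lane checks -Bounds <= V <= Bounds, so a sketch like
+//
+//     XMVECTOR bounds = XMVectorReplicate(1.0f);
+//     XMVECTOR mask   = XMVectorInBounds(XMVectorSet(0.5f, -0.25f, 2.0f, 0.0f), bounds);
+//
+// leaves the z lane at 0 and the other lanes at 0xFFFFFFFF.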
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorInBoundsR
+(
+ UINT* pCR,
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT ux, uy, uz, uw, CR;
+ XMVECTOR Control;
+
+ XMASSERT( pCR != 0 );
+
+ ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+
+ CR = 0;
+
+ if (ux&uy&uz&uw)
+ {
+ // All elements are in bounds
+ CR = XM_CRMASK_CR6BOUNDS;
+ }
+ *pCR = CR;
+ Control.vector4_u32[0] = ux;
+ Control.vector4_u32[1] = uy;
+ Control.vector4_u32[2] = uz;
+ Control.vector4_u32[3] = uw;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT( pCR != 0 );
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+
+ UINT CR = 0;
+ if (_mm_movemask_ps(vTemp1)==0xf) {
+ // All elements are in bounds
+ CR = XM_CRMASK_CR6BOUNDS;
+ }
+ *pCR = CR;
+ return vTemp1;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorIsNaN
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the exponent
+ __m128i vTempInf = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMInfinity);
+ // Mask off the mantissa
+ __m128i vTempNan = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMQNaNTest);
+ // Are any of the exponents == 0x7F800000?
+ vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity);
+    // Are any of the mantissas zero? (SSE2 doesn't have a neq test)
+ vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero);
+ // Perform a not on the NaN test to be true on NON-zero mantissas
+ vTempNan = _mm_andnot_si128(vTempNan,vTempInf);
+ // If any are NaN, the signs are true after the merge above
+ return reinterpret_cast<const XMVECTOR *>(&vTempNan)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorIsInfinite
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Control;
+ Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+ Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+ return Control;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the sign bit
+ __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
+ // Compare to infinity
+ vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
+ // If any are infinity, the signs are true.
+ return vTemp;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Rounding and clamping operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMin
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
+ Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
+ Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
+ Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_min_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMax
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
+ Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
+ Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
+ Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_max_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorRound
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ XMVECTOR Bias;
+ CONST XMVECTOR Zero = XMVectorZero();
+ CONST XMVECTOR BiasPos = XMVectorReplicate(0.5f);
+ CONST XMVECTOR BiasNeg = XMVectorReplicate(-0.5f);
+
+ Bias = XMVectorLess(V, Zero);
+ Bias = XMVectorSelect(BiasPos, BiasNeg, Bias);
+ Result = XMVectorAdd(V, Bias);
+ Result = XMVectorTruncate(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // To handle NAN, INF and numbers greater than 8388608, use masking
+ // Get the abs value
+ __m128i vTest = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMAbsMask);
+    // Test for greater than 8388608 (all floats with NO fractionals, NAN and INF)
+ vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
+ // Convert to int and back to float for rounding
+ __m128i vInt = _mm_cvtps_epi32(V);
+ // Convert back to floats
+ XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
+ // All numbers less than 8388608 will use the round to int
+ vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ // All others, use the ORIGINAL value
+ vTest = _mm_andnot_si128(vTest,reinterpret_cast<const __m128i *>(&V)[0]);
+ vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
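+
+// Note added in this tree (not part of upstream XNAMath): for ordinary magnitudes
+// this returns the nearest whole number, e.g. (sketch)
+//
+//     XMVectorRound(XMVectorSet(1.3f, -1.3f, 2.7f, -2.7f));   // -> (1, -1, 3, -3)
+//
+// Values at or above 2^23 (8388608.0f), NaN and INF carry no fractional part and
+// are returned unchanged by the SSE masking above.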
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorTruncate
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result;
+ UINT i;
+
+ // Avoid C4701
+ Result.vector4_f32[0] = 0.0f;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (XMISNAN(V.vector4_f32[i]))
+ {
+ Result.vector4_u32[i] = 0x7FC00000;
+ }
+ else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
+ {
+ Result.vector4_f32[i] = (FLOAT)((INT)V.vector4_f32[i]);
+ }
+ else
+ {
+ Result.vector4_f32[i] = V.vector4_f32[i];
+ }
+ }
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // To handle NAN, INF and numbers greater than 8388608, use masking
+ // Get the abs value
+ __m128i vTest = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMAbsMask);
+    // Test for greater than 8388608 (all floats with NO fractionals, NAN and INF)
+ vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
+ // Convert to int and back to float for rounding with truncation
+ __m128i vInt = _mm_cvttps_epi32(V);
+ // Convert back to floats
+ XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
+ // All numbers less than 8388608 will use the round to int
+ vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ // All others, use the ORIGINAL value
+ vTest = _mm_andnot_si128(vTest,reinterpret_cast<const __m128i *>(&V)[0]);
+ vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorFloor
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR vResult = {
+ floorf(V.vector4_f32[0]),
+ floorf(V.vector4_f32[1]),
+ floorf(V.vector4_f32[2]),
+ floorf(V.vector4_f32[3])
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // To handle NAN, INF and numbers greater than 8388608, use masking
+ // Get the abs value
+ __m128i vTest = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMAbsMask);
+    // Test for greater than 8388608 (all floats with NO fractionals, NAN and INF)
+ vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
+ // Convert to int and back to float for rounding
+ XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon);
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Convert back to floats
+ vResult = _mm_cvtepi32_ps(vInt);
+ // All numbers less than 8388608 will use the round to int
+ vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ // All others, use the ORIGINAL value
+ vTest = _mm_andnot_si128(vTest,reinterpret_cast<const __m128i *>(&V)[0]);
+ vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorCeiling
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {
+ ceilf(V.vector4_f32[0]),
+ ceilf(V.vector4_f32[1]),
+ ceilf(V.vector4_f32[2]),
+ ceilf(V.vector4_f32[3])
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // To handle NAN, INF and numbers greater than 8388608, use masking
+ // Get the abs value
+ __m128i vTest = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMAbsMask);
+    // Test for greater than 8388608 (all floats with NO fractionals, NAN and INF)
+ vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
+ // Convert to int and back to float for rounding
+ XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon);
+ __m128i vInt = _mm_cvtps_epi32(vResult);
+ // Convert back to floats
+ vResult = _mm_cvtepi32_ps(vInt);
+ // All numbers less than 8388608 will use the round to int
+ vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ // All others, use the ORIGINAL value
+ vTest = _mm_andnot_si128(vTest,reinterpret_cast<const __m128i *>(&V)[0]);
+ vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorClamp
+(
+ FXMVECTOR V,
+ FXMVECTOR Min,
+ FXMVECTOR Max
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ XMASSERT(XMVector4LessOrEqual(Min, Max));
+
+ Result = XMVectorMax(Min, V);
+ Result = XMVectorMin(Max, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult;
+ XMASSERT(XMVector4LessOrEqual(Min, Max));
+ vResult = _mm_max_ps(Min,V);
+ vResult = _mm_min_ps(vResult,Max);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSaturate
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ return XMVectorClamp(V, Zero, g_XMOne.v);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Set <0 to 0
+ XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+ // Set>1 to 1
+ return _mm_min_ps(vResult,g_XMOne);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
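+
+// Note added in this tree (not part of upstream XNAMath): XMVectorSaturate is just
+// XMVectorClamp against the constant 0..1 range, e.g. (sketch)
+//
+//     XMVectorSaturate(XMVectorSet(-0.5f, 0.25f, 1.5f, 1.0f));   // -> (0.0f, 0.25f, 1.0f, 1.0f)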
+
+//------------------------------------------------------------------------------
+// Bitwise logical operations
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorAndInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0];
+ Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1];
+ Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2];
+ Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3];
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_and_ps(V1,V2);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorAndCInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0];
+ Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1];
+ Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2];
+ Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_andnot_si128( reinterpret_cast<const __m128i *>(&V2)[0], reinterpret_cast<const __m128i *>(&V1)[0] );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorOrInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0];
+ Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1];
+ Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2];
+ Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_or_si128( reinterpret_cast<const __m128i *>(&V1)[0], reinterpret_cast<const __m128i *>(&V2)[0] );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNorInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
+ Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
+ Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
+ Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i Result;
+ Result = _mm_or_si128( reinterpret_cast<const __m128i *>(&V1)[0], reinterpret_cast<const __m128i *>(&V2)[0] );
+ Result = _mm_andnot_si128( Result,g_XMNegOneMask);
+ return reinterpret_cast<__m128 *>(&Result)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorXorInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0];
+ Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1];
+ Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2];
+ Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i V = _mm_xor_si128( reinterpret_cast<const __m128i *>(&V1)[0], reinterpret_cast<const __m128i *>(&V2)[0] );
+ return reinterpret_cast<__m128 *>(&V)[0];
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNegate
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = -V.vector4_f32[0];
+ Result.vector4_f32[1] = -V.vector4_f32[1];
+ Result.vector4_f32[2] = -V.vector4_f32[2];
+ Result.vector4_f32[3] = -V.vector4_f32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Z;
+
+ Z = _mm_setzero_ps();
+
+ return _mm_sub_ps( Z, V );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorAdd
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0];
+ Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1];
+ Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2];
+ Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_add_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorAddAngles
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Mask;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ // Add the given angles together. If the range of V1 is such
+ // that -Pi <= V1 < Pi and the range of V2 is such that
+ // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
+ // will be -Pi <= Result < Pi.
+ Result = XMVectorAdd(V1, V2);
+
+ Mask = XMVectorLess(Result, g_XMNegativePi.v);
+ Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
+
+ Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
+ Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
+
+ Result = XMVectorAdd(Result, Offset);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Adjust the angles
+ XMVECTOR vResult = _mm_add_ps(V1,V2);
+    // Less than -Pi?
+ XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
+ vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
+ // Add 2Pi to all entries less than -Pi
+ vResult = _mm_add_ps(vResult,vOffset);
+ // Greater than or equal to Pi?
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi);
+ vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than or equal to Pi
+ vResult = _mm_sub_ps(vResult,vOffset);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
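+
+// Worked example added in this tree (not part of upstream XNAMath), assuming the
+// inputs respect the ranges stated above:
+//
+//     XMVectorAddAngles(XMVectorReplicate(3.0f), XMVectorReplicate(1.0f));
+//     // 3.0 + 1.0 = 4.0 >= XM_PI, so 2*XM_PI is subtracted -> roughly -2.2832f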
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSubtract
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0];
+ Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1];
+ Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2];
+ Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_sub_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSubtractAngles
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Mask;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ // Subtract the given angles. If the range of V1 is such
+ // that -Pi <= V1 < Pi and the range of V2 is such that
+ // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
+ // will be -Pi <= Result < Pi.
+ Result = XMVectorSubtract(V1, V2);
+
+ Mask = XMVectorLess(Result, g_XMNegativePi.v);
+ Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
+
+ Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
+ Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
+
+ Result = XMVectorAdd(Result, Offset);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Adjust the angles
+ XMVECTOR vResult = _mm_sub_ps(V1,V2);
+    // Less than -Pi?
+ XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
+ vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
+ // Add 2Pi to all entries less than -Pi
+ vResult = _mm_add_ps(vResult,vOffset);
+ // Greater than or equal to Pi?
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi);
+ vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
+    // Subtract 2Pi from all entries greater than or equal to Pi
+ vResult = _mm_sub_ps(vResult,vOffset);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMultiply
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result = {
+ V1.vector4_f32[0] * V2.vector4_f32[0],
+ V1.vector4_f32[1] * V2.vector4_f32[1],
+ V1.vector4_f32[2] * V2.vector4_f32[2],
+ V1.vector4_f32[3] * V2.vector4_f32[3]
+ };
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_mul_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMultiplyAdd
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR V3
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {
+ (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0],
+ (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1],
+ (V1.vector4_f32[2] * V2.vector4_f32[2]) + V3.vector4_f32[2],
+ (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3]
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_mul_ps( V1, V2 );
+ return _mm_add_ps(vResult, V3 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorDivide
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result;
+ Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0];
+ Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1];
+ Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2];
+ Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_div_ps( V1, V2 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorNegativeMultiplySubtract
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR V3
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR vResult = {
+ V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
+ V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
+ V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
+ V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR R = _mm_mul_ps( V1, V2 );
+ return _mm_sub_ps( V3, R );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorScale
+(
+ FXMVECTOR V,
+ FLOAT ScaleFactor
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {
+ V.vector4_f32[0] * ScaleFactor,
+ V.vector4_f32[1] * ScaleFactor,
+ V.vector4_f32[2] * ScaleFactor,
+ V.vector4_f32[3] * ScaleFactor
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
+ return _mm_mul_ps(vResult,V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorReciprocalEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result;
+ UINT i;
+
+ // Avoid C4701
+ Result.vector4_f32[0] = 0.0f;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (XMISNAN(V.vector4_f32[i]))
+ {
+ Result.vector4_u32[i] = 0x7FC00000;
+ }
+ else if (V.vector4_f32[i] == 0.0f || V.vector4_f32[i] == -0.0f)
+ {
+ Result.vector4_u32[i] = 0x7F800000 | (V.vector4_u32[i] & 0x80000000);
+ }
+ else
+ {
+ Result.vector4_f32[i] = 1.f / V.vector4_f32[i];
+ }
+ }
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_rcp_ps(V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorReciprocal
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return XMVectorReciprocalEst(V);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_div_ps(g_XMOne,V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Return an estimated square root
+XMFINLINE XMVECTOR XMVectorSqrtEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Select;
+
+ // if (x == +Infinity) sqrt(x) = +Infinity
+ // if (x == +0.0f) sqrt(x) = +0.0f
+ // if (x == -0.0f) sqrt(x) = -0.0f
+ // if (x < 0.0f) sqrt(x) = QNaN
+
+ XMVECTOR Result = XMVectorReciprocalSqrtEst(V);
+ XMVECTOR Zero = XMVectorZero();
+ XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
+ XMVECTOR VEqualsZero = XMVectorEqual(V, Zero);
+ Result = XMVectorMultiply(V, Result);
+ Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
+ Result = XMVectorSelect(V, Result, Select);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_sqrt_ps(V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSqrt
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Zero;
+ XMVECTOR VEqualsInfinity, VEqualsZero;
+ XMVECTOR Select;
+ XMVECTOR Result;
+
+ // if (x == +Infinity) sqrt(x) = +Infinity
+ // if (x == +0.0f) sqrt(x) = +0.0f
+ // if (x == -0.0f) sqrt(x) = -0.0f
+ // if (x < 0.0f) sqrt(x) = QNaN
+
+ Result = XMVectorReciprocalSqrt(V);
+ Zero = XMVectorZero();
+ VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
+ VEqualsZero = XMVectorEqual(V, Zero);
+ Result = XMVectorMultiply(V, Result);
+ Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
+ Result = XMVectorSelect(V, Result, Select);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_sqrt_ps(V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
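+
+// Sketch added in this tree (not part of upstream XNAMath) of the special cases
+// listed above:
+//
+//     XMVectorSqrt(XMVectorSet(4.0f, 0.0f, -0.0f, -1.0f));
+//     // -> (2.0f, +0.0f, -0.0f, QNaN)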
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorReciprocalSqrtEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // if (x == +Infinity) rsqrt(x) = 0
+ // if (x == +0.0f) rsqrt(x) = +Infinity
+ // if (x == -0.0f) rsqrt(x) = -Infinity
+ // if (x < 0.0f) rsqrt(x) = QNaN
+
+ XMVECTOR Result;
+ UINT i;
+
+ // Avoid C4701
+ Result.vector4_f32[0] = 0.0f;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (XMISNAN(V.vector4_f32[i]))
+ {
+ Result.vector4_u32[i] = 0x7FC00000;
+ }
+ else if (V.vector4_f32[i] == 0.0f || V.vector4_f32[i] == -0.0f)
+ {
+ Result.vector4_u32[i] = 0x7F800000 | (V.vector4_u32[i] & 0x80000000);
+ }
+ else if (V.vector4_f32[i] < 0.0f)
+ {
+ Result.vector4_u32[i] = 0x7FFFFFFF;
+ }
+ else if (XMISINF(V.vector4_f32[i]))
+ {
+ Result.vector4_f32[i] = 0.0f;
+ }
+ else
+ {
+ Result.vector4_f32[i] = 1.0f / sqrtf(V.vector4_f32[i]);
+ }
+ }
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ return _mm_rsqrt_ps(V);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorReciprocalSqrt
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return XMVectorReciprocalSqrtEst(V);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_sqrt_ps(V);
+ vResult = _mm_div_ps(g_XMOne,vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorExpEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]);
+ Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]);
+ Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]);
+ Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_setr_ps(
+ powf(2.0f,XMVectorGetX(V)),
+ powf(2.0f,XMVectorGetY(V)),
+ powf(2.0f,XMVectorGetZ(V)),
+ powf(2.0f,XMVectorGetW(V)));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorExp
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR E, S;
+ XMVECTOR R, R2, R3, R4;
+ XMVECTOR V0, V1;
+ XMVECTOR C0X, C0Y, C0Z, C0W;
+ XMVECTOR C1X, C1Y, C1Z, C1W;
+ XMVECTOR Result;
+ static CONST XMVECTOR C0 = {1.0f, -6.93147182e-1f, 2.40226462e-1f, -5.55036440e-2f};
+ static CONST XMVECTOR C1 = {9.61597636e-3f, -1.32823968e-3f, 1.47491097e-4f, -1.08635004e-5f};
+
+ R = XMVectorFloor(V);
+ E = XMVectorExpEst(R);
+ R = XMVectorSubtract(V, R);
+ R2 = XMVectorMultiply(R, R);
+ R3 = XMVectorMultiply(R, R2);
+ R4 = XMVectorMultiply(R2, R2);
+
+ C0X = XMVectorSplatX(C0);
+ C0Y = XMVectorSplatY(C0);
+ C0Z = XMVectorSplatZ(C0);
+ C0W = XMVectorSplatW(C0);
+
+ C1X = XMVectorSplatX(C1);
+ C1Y = XMVectorSplatY(C1);
+ C1Z = XMVectorSplatZ(C1);
+ C1W = XMVectorSplatW(C1);
+
+ V0 = XMVectorMultiplyAdd(R, C0Y, C0X);
+ V0 = XMVectorMultiplyAdd(R2, C0Z, V0);
+ V0 = XMVectorMultiplyAdd(R3, C0W, V0);
+
+ V1 = XMVectorMultiplyAdd(R, C1Y, C1X);
+ V1 = XMVectorMultiplyAdd(R2, C1Z, V1);
+ V1 = XMVectorMultiplyAdd(R3, C1W, V1);
+
+ S = XMVectorMultiplyAdd(R4, V1, V0);
+
+ S = XMVectorReciprocal(S);
+ Result = XMVectorMultiply(E, S);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 C0 = {1.0f, -6.93147182e-1f, 2.40226462e-1f, -5.55036440e-2f};
+ static CONST XMVECTORF32 C1 = {9.61597636e-3f, -1.32823968e-3f, 1.47491097e-4f, -1.08635004e-5f};
+
+ // Get the integer of the input
+ XMVECTOR R = XMVectorFloor(V);
+ // Get the exponent estimate
+ XMVECTOR E = XMVectorExpEst(R);
+ // Get the fractional only
+ R = _mm_sub_ps(V,R);
+ // Get R^2
+ XMVECTOR R2 = _mm_mul_ps(R,R);
+ // And R^3
+ XMVECTOR R3 = _mm_mul_ps(R,R2);
+
+ XMVECTOR V0 = _mm_load_ps1(&C0.f[1]);
+ V0 = _mm_mul_ps(V0,R);
+ XMVECTOR vConstants = _mm_load_ps1(&C0.f[0]);
+ V0 = _mm_add_ps(V0,vConstants);
+ vConstants = _mm_load_ps1(&C0.f[2]);
+ vConstants = _mm_mul_ps(vConstants,R2);
+ V0 = _mm_add_ps(V0,vConstants);
+ vConstants = _mm_load_ps1(&C0.f[3]);
+ vConstants = _mm_mul_ps(vConstants,R3);
+ V0 = _mm_add_ps(V0,vConstants);
+
+ XMVECTOR V1 = _mm_load_ps1(&C1.f[1]);
+ V1 = _mm_mul_ps(V1,R);
+ vConstants = _mm_load_ps1(&C1.f[0]);
+ V1 = _mm_add_ps(V1,vConstants);
+ vConstants = _mm_load_ps1(&C1.f[2]);
+ vConstants = _mm_mul_ps(vConstants,R2);
+ V1 = _mm_add_ps(V1,vConstants);
+ vConstants = _mm_load_ps1(&C1.f[3]);
+ vConstants = _mm_mul_ps(vConstants,R3);
+ V1 = _mm_add_ps(V1,vConstants);
+ // R2 = R^4
+ R2 = _mm_mul_ps(R2,R2);
+ R2 = _mm_mul_ps(R2,V1);
+ R2 = _mm_add_ps(R2,V0);
+ E = _mm_div_ps(E,R2);
+ return E;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
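+
+// Note added in this tree (not part of upstream XNAMath): the split above computes
+// 2^V = 2^floor(V) * 2^frac(V). E holds the integer part 2^floor(V), and the two
+// polynomials build S, approximately 2^-frac(V), as a degree-7 series in frac(V)
+// (the C0/C1 coefficients are close to (-ln 2)^k / k!), so the final E / S
+// recovers 2^V.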
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorLogEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ FLOAT fScale = (1.0f / logf(2.0f));
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale;
+ Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale;
+ Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale;
+ Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale;
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f));
+ XMVECTOR vResult = _mm_setr_ps(
+ logf(XMVectorGetX(V)),
+ logf(XMVectorGetY(V)),
+ logf(XMVectorGetZ(V)),
+ logf(XMVectorGetW(V)));
+ vResult = _mm_mul_ps(vResult,vScale);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorLog
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fScale = (1.0f / logf(2.0f));
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale;
+ Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale;
+ Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale;
+ Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale;
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f));
+ XMVECTOR vResult = _mm_setr_ps(
+ logf(XMVectorGetX(V)),
+ logf(XMVectorGetY(V)),
+ logf(XMVectorGetZ(V)),
+ logf(XMVectorGetW(V)));
+ vResult = _mm_mul_ps(vResult,vScale);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
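+
+// [Editorial note, not part of XNAMath] XMVectorLogEst and XMVectorLog above both
+// return a base-2 logarithm per component, using log2(x) = ln(x) * (1 / ln 2). A
+// hypothetical scalar sketch of the same identity, kept under #if 0:
+#if 0
+inline float Log2Scalar(float x)               // assumes logf from <math.h>
+{
+    return logf(x) * (1.0f / logf(2.0f));      // scale the natural log by 1/ln(2)
+}
+#endif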
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorPowEst
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]);
+ Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]);
+ Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]);
+ Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_setr_ps(
+ powf(XMVectorGetX(V1),XMVectorGetX(V2)),
+ powf(XMVectorGetY(V1),XMVectorGetY(V2)),
+ powf(XMVectorGetZ(V1),XMVectorGetZ(V2)),
+ powf(XMVectorGetW(V1),XMVectorGetW(V2)));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorPow
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
+
+ return XMVectorPowEst(V1, V2);
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorAbs
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {
+ fabsf(V.vector4_f32[0]),
+ fabsf(V.vector4_f32[1]),
+ fabsf(V.vector4_f32[2]),
+ fabsf(V.vector4_f32[3])
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_setzero_ps();
+ vResult = _mm_sub_ps(vResult,V);
+ vResult = _mm_max_ps(vResult,V);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorMod
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Reciprocal;
+ XMVECTOR Quotient;
+ XMVECTOR Result;
+
+ // V1 % V2 = V1 - V2 * truncate(V1 / V2)
+ Reciprocal = XMVectorReciprocal(V2);
+ Quotient = XMVectorMultiply(V1, Reciprocal);
+ Quotient = XMVectorTruncate(Quotient);
+ Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_div_ps(V1, V2);
+ vResult = XMVectorTruncate(vResult);
+ vResult = _mm_mul_ps(vResult,V2);
+ vResult = _mm_sub_ps(V1,vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
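+
+// [Editorial note, not part of XNAMath] XMVectorMod above computes the C-style
+// remainder V1 - V2 * trunc(V1 / V2) per component, so the result keeps the sign of
+// V1 (for example, -7 mod 3 gives -1). A hypothetical scalar sketch, kept under #if 0:
+#if 0
+inline float ModScalar(float a, float b)       // assumes truncf from <math.h>
+{
+    // trunc() rounds the quotient toward zero, matching XMVectorTruncate above
+    return a - b * truncf(a / b);
+}
+#endif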
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorModAngles
+(
+ FXMVECTOR Angles
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR Result;
+
+ // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
+ V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v);
+ V = XMVectorRound(V);
+ Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
+ XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi);
+    // Use the inline function because of the complexity of the rounding
+ vResult = XMVectorRound(vResult);
+ vResult = _mm_mul_ps(vResult,g_XMTwoPi);
+ vResult = _mm_sub_ps(Angles,vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
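+
+// [Editorial note, not part of XNAMath] XMVectorModAngles wraps each component into
+// the -PI..PI range by subtracting the nearest whole multiple of 2*PI. A hypothetical
+// scalar sketch, kept under #if 0 (roundf is a close stand-in for XMVectorRound):
+#if 0
+inline float ModAngleScalar(float angle)       // assumes roundf from <math.h>
+{
+    const float twoPi = 6.283185307f;
+    return angle - twoPi * roundf(angle * (1.0f / twoPi));
+}
+#endif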
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorSin
+(
+ FXMVECTOR V
+)
+{
+
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2, V3, V5, V7, V9, V11, V13, V15, V17, V19, V21, V23;
+ XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11;
+ XMVECTOR Result;
+
+ V1 = XMVectorModAngles(V);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+ V2 = XMVectorMultiply(V1, V1);
+ V3 = XMVectorMultiply(V2, V1);
+ V5 = XMVectorMultiply(V3, V2);
+ V7 = XMVectorMultiply(V5, V2);
+ V9 = XMVectorMultiply(V7, V2);
+ V11 = XMVectorMultiply(V9, V2);
+ V13 = XMVectorMultiply(V11, V2);
+ V15 = XMVectorMultiply(V13, V2);
+ V17 = XMVectorMultiply(V15, V2);
+ V19 = XMVectorMultiply(V17, V2);
+ V21 = XMVectorMultiply(V19, V2);
+ V23 = XMVectorMultiply(V21, V2);
+
+ S1 = XMVectorSplatY(g_XMSinCoefficients0.v);
+ S2 = XMVectorSplatZ(g_XMSinCoefficients0.v);
+ S3 = XMVectorSplatW(g_XMSinCoefficients0.v);
+ S4 = XMVectorSplatX(g_XMSinCoefficients1.v);
+ S5 = XMVectorSplatY(g_XMSinCoefficients1.v);
+ S6 = XMVectorSplatZ(g_XMSinCoefficients1.v);
+ S7 = XMVectorSplatW(g_XMSinCoefficients1.v);
+ S8 = XMVectorSplatX(g_XMSinCoefficients2.v);
+ S9 = XMVectorSplatY(g_XMSinCoefficients2.v);
+ S10 = XMVectorSplatZ(g_XMSinCoefficients2.v);
+ S11 = XMVectorSplatW(g_XMSinCoefficients2.v);
+
+ Result = XMVectorMultiplyAdd(S1, V3, V1);
+ Result = XMVectorMultiplyAdd(S2, V5, Result);
+ Result = XMVectorMultiplyAdd(S3, V7, Result);
+ Result = XMVectorMultiplyAdd(S4, V9, Result);
+ Result = XMVectorMultiplyAdd(S5, V11, Result);
+ Result = XMVectorMultiplyAdd(S6, V13, Result);
+ Result = XMVectorMultiplyAdd(S7, V15, Result);
+ Result = XMVectorMultiplyAdd(S8, V17, Result);
+ Result = XMVectorMultiplyAdd(S9, V19, Result);
+ Result = XMVectorMultiplyAdd(S10, V21, Result);
+ Result = XMVectorMultiplyAdd(S11, V23, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Force the value within the bounds of pi
+ XMVECTOR vResult = XMVectorModAngles(V);
+    // Each one is V to the "num" power
+ // V2 = V1^2
+ XMVECTOR V2 = _mm_mul_ps(vResult,vResult);
+ // V1^3
+ XMVECTOR vPower = _mm_mul_ps(vResult,V2);
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^5
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^7
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients0.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^9
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[0]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^11
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^13
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^15
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients1.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^17
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[0]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^19
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^21
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^23
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMSinCoefficients2.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
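+
+// [Editorial note, not part of XNAMath] XMVectorSin evaluates the long Taylor-style
+// series shown in the comment above after reducing the argument to -PI..PI. A
+// hypothetical scalar sketch truncated at the x^7 term (the vector path carries the
+// series out to x^23), kept under #if 0:
+#if 0
+inline float SinSeriesScalar(float x)          // expects x already reduced to -PI..PI
+{
+    float x2 = x * x;
+    // sin(x) ~= x - x^3/3! + x^5/5! - x^7/7!
+    return x * (1.0f + x2 * (-1.0f / 6.0f + x2 * (1.0f / 120.0f - x2 / 5040.0f)));
+}
+#endif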
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorCos
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2, V4, V6, V8, V10, V12, V14, V16, V18, V20, V22;
+ XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ XMVECTOR Result;
+
+ V1 = XMVectorModAngles(V);
+
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+ // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+ V2 = XMVectorMultiply(V1, V1);
+ V4 = XMVectorMultiply(V2, V2);
+ V6 = XMVectorMultiply(V4, V2);
+ V8 = XMVectorMultiply(V4, V4);
+ V10 = XMVectorMultiply(V6, V4);
+ V12 = XMVectorMultiply(V6, V6);
+ V14 = XMVectorMultiply(V8, V6);
+ V16 = XMVectorMultiply(V8, V8);
+ V18 = XMVectorMultiply(V10, V8);
+ V20 = XMVectorMultiply(V10, V10);
+ V22 = XMVectorMultiply(V12, V10);
+
+ C1 = XMVectorSplatY(g_XMCosCoefficients0.v);
+ C2 = XMVectorSplatZ(g_XMCosCoefficients0.v);
+ C3 = XMVectorSplatW(g_XMCosCoefficients0.v);
+ C4 = XMVectorSplatX(g_XMCosCoefficients1.v);
+ C5 = XMVectorSplatY(g_XMCosCoefficients1.v);
+ C6 = XMVectorSplatZ(g_XMCosCoefficients1.v);
+ C7 = XMVectorSplatW(g_XMCosCoefficients1.v);
+ C8 = XMVectorSplatX(g_XMCosCoefficients2.v);
+ C9 = XMVectorSplatY(g_XMCosCoefficients2.v);
+ C10 = XMVectorSplatZ(g_XMCosCoefficients2.v);
+ C11 = XMVectorSplatW(g_XMCosCoefficients2.v);
+
+ Result = XMVectorMultiplyAdd(C1, V2, g_XMOne.v);
+ Result = XMVectorMultiplyAdd(C2, V4, Result);
+ Result = XMVectorMultiplyAdd(C3, V6, Result);
+ Result = XMVectorMultiplyAdd(C4, V8, Result);
+ Result = XMVectorMultiplyAdd(C5, V10, Result);
+ Result = XMVectorMultiplyAdd(C6, V12, Result);
+ Result = XMVectorMultiplyAdd(C7, V14, Result);
+ Result = XMVectorMultiplyAdd(C8, V16, Result);
+ Result = XMVectorMultiplyAdd(C9, V18, Result);
+ Result = XMVectorMultiplyAdd(C10, V20, Result);
+ Result = XMVectorMultiplyAdd(C11, V22, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Force the value within the bounds of pi
+ XMVECTOR V2 = XMVectorModAngles(V);
+    // Each one is V to the "num" power
+ // V2 = V1^2
+ V2 = _mm_mul_ps(V2,V2);
+ // V^2
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[1]);
+ vConstants = _mm_mul_ps(vConstants,V2);
+ XMVECTOR vResult = _mm_add_ps(vConstants,g_XMOne);
+
+ // V^4
+ XMVECTOR vPower = _mm_mul_ps(V2,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^6
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients0.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^8
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[0]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^10
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^12
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^14
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients1.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^16
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[0]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^18
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^20
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[2]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ // V^22
+ vPower = _mm_mul_ps(vPower,V2);
+ vConstants = _mm_load_ps1(&g_XMCosCoefficients2.f[3]);
+ vConstants = _mm_mul_ps(vConstants,vPower);
+ vResult = _mm_add_ps(vResult,vConstants);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE VOID XMVectorSinCos
+(
+ XMVECTOR* pSin,
+ XMVECTOR* pCos,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13;
+ XMVECTOR V14, V15, V16, V17, V18, V19, V20, V21, V22, V23;
+ XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11;
+ XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ XMVECTOR Sin, Cos;
+
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+
+ V1 = XMVectorModAngles(V);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+ // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+
+ V2 = XMVectorMultiply(V1, V1);
+ V3 = XMVectorMultiply(V2, V1);
+ V4 = XMVectorMultiply(V2, V2);
+ V5 = XMVectorMultiply(V3, V2);
+ V6 = XMVectorMultiply(V3, V3);
+ V7 = XMVectorMultiply(V4, V3);
+ V8 = XMVectorMultiply(V4, V4);
+ V9 = XMVectorMultiply(V5, V4);
+ V10 = XMVectorMultiply(V5, V5);
+ V11 = XMVectorMultiply(V6, V5);
+ V12 = XMVectorMultiply(V6, V6);
+ V13 = XMVectorMultiply(V7, V6);
+ V14 = XMVectorMultiply(V7, V7);
+ V15 = XMVectorMultiply(V8, V7);
+ V16 = XMVectorMultiply(V8, V8);
+ V17 = XMVectorMultiply(V9, V8);
+ V18 = XMVectorMultiply(V9, V9);
+ V19 = XMVectorMultiply(V10, V9);
+ V20 = XMVectorMultiply(V10, V10);
+ V21 = XMVectorMultiply(V11, V10);
+ V22 = XMVectorMultiply(V11, V11);
+ V23 = XMVectorMultiply(V12, V11);
+
+ S1 = XMVectorSplatY(g_XMSinCoefficients0.v);
+ S2 = XMVectorSplatZ(g_XMSinCoefficients0.v);
+ S3 = XMVectorSplatW(g_XMSinCoefficients0.v);
+ S4 = XMVectorSplatX(g_XMSinCoefficients1.v);
+ S5 = XMVectorSplatY(g_XMSinCoefficients1.v);
+ S6 = XMVectorSplatZ(g_XMSinCoefficients1.v);
+ S7 = XMVectorSplatW(g_XMSinCoefficients1.v);
+ S8 = XMVectorSplatX(g_XMSinCoefficients2.v);
+ S9 = XMVectorSplatY(g_XMSinCoefficients2.v);
+ S10 = XMVectorSplatZ(g_XMSinCoefficients2.v);
+ S11 = XMVectorSplatW(g_XMSinCoefficients2.v);
+
+ C1 = XMVectorSplatY(g_XMCosCoefficients0.v);
+ C2 = XMVectorSplatZ(g_XMCosCoefficients0.v);
+ C3 = XMVectorSplatW(g_XMCosCoefficients0.v);
+ C4 = XMVectorSplatX(g_XMCosCoefficients1.v);
+ C5 = XMVectorSplatY(g_XMCosCoefficients1.v);
+ C6 = XMVectorSplatZ(g_XMCosCoefficients1.v);
+ C7 = XMVectorSplatW(g_XMCosCoefficients1.v);
+ C8 = XMVectorSplatX(g_XMCosCoefficients2.v);
+ C9 = XMVectorSplatY(g_XMCosCoefficients2.v);
+ C10 = XMVectorSplatZ(g_XMCosCoefficients2.v);
+ C11 = XMVectorSplatW(g_XMCosCoefficients2.v);
+
+ Sin = XMVectorMultiplyAdd(S1, V3, V1);
+ Sin = XMVectorMultiplyAdd(S2, V5, Sin);
+ Sin = XMVectorMultiplyAdd(S3, V7, Sin);
+ Sin = XMVectorMultiplyAdd(S4, V9, Sin);
+ Sin = XMVectorMultiplyAdd(S5, V11, Sin);
+ Sin = XMVectorMultiplyAdd(S6, V13, Sin);
+ Sin = XMVectorMultiplyAdd(S7, V15, Sin);
+ Sin = XMVectorMultiplyAdd(S8, V17, Sin);
+ Sin = XMVectorMultiplyAdd(S9, V19, Sin);
+ Sin = XMVectorMultiplyAdd(S10, V21, Sin);
+ Sin = XMVectorMultiplyAdd(S11, V23, Sin);
+
+ Cos = XMVectorMultiplyAdd(C1, V2, g_XMOne.v);
+ Cos = XMVectorMultiplyAdd(C2, V4, Cos);
+ Cos = XMVectorMultiplyAdd(C3, V6, Cos);
+ Cos = XMVectorMultiplyAdd(C4, V8, Cos);
+ Cos = XMVectorMultiplyAdd(C5, V10, Cos);
+ Cos = XMVectorMultiplyAdd(C6, V12, Cos);
+ Cos = XMVectorMultiplyAdd(C7, V14, Cos);
+ Cos = XMVectorMultiplyAdd(C8, V16, Cos);
+ Cos = XMVectorMultiplyAdd(C9, V18, Cos);
+ Cos = XMVectorMultiplyAdd(C10, V20, Cos);
+ Cos = XMVectorMultiplyAdd(C11, V22, Cos);
+
+ *pSin = Sin;
+ *pCos = Cos;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+ XMVECTOR V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13;
+ XMVECTOR V14, V15, V16, V17, V18, V19, V20, V21, V22, V23;
+ XMVECTOR S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11;
+ XMVECTOR C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ XMVECTOR Sin, Cos;
+
+ V1 = XMVectorModAngles(V);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+ // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+
+ V2 = XMVectorMultiply(V1, V1);
+ V3 = XMVectorMultiply(V2, V1);
+ V4 = XMVectorMultiply(V2, V2);
+ V5 = XMVectorMultiply(V3, V2);
+ V6 = XMVectorMultiply(V3, V3);
+ V7 = XMVectorMultiply(V4, V3);
+ V8 = XMVectorMultiply(V4, V4);
+ V9 = XMVectorMultiply(V5, V4);
+ V10 = XMVectorMultiply(V5, V5);
+ V11 = XMVectorMultiply(V6, V5);
+ V12 = XMVectorMultiply(V6, V6);
+ V13 = XMVectorMultiply(V7, V6);
+ V14 = XMVectorMultiply(V7, V7);
+ V15 = XMVectorMultiply(V8, V7);
+ V16 = XMVectorMultiply(V8, V8);
+ V17 = XMVectorMultiply(V9, V8);
+ V18 = XMVectorMultiply(V9, V9);
+ V19 = XMVectorMultiply(V10, V9);
+ V20 = XMVectorMultiply(V10, V10);
+ V21 = XMVectorMultiply(V11, V10);
+ V22 = XMVectorMultiply(V11, V11);
+ V23 = XMVectorMultiply(V12, V11);
+
+ S1 = _mm_load_ps1(&g_XMSinCoefficients0.f[1]);
+ S2 = _mm_load_ps1(&g_XMSinCoefficients0.f[2]);
+ S3 = _mm_load_ps1(&g_XMSinCoefficients0.f[3]);
+ S4 = _mm_load_ps1(&g_XMSinCoefficients1.f[0]);
+ S5 = _mm_load_ps1(&g_XMSinCoefficients1.f[1]);
+ S6 = _mm_load_ps1(&g_XMSinCoefficients1.f[2]);
+ S7 = _mm_load_ps1(&g_XMSinCoefficients1.f[3]);
+ S8 = _mm_load_ps1(&g_XMSinCoefficients2.f[0]);
+ S9 = _mm_load_ps1(&g_XMSinCoefficients2.f[1]);
+ S10 = _mm_load_ps1(&g_XMSinCoefficients2.f[2]);
+ S11 = _mm_load_ps1(&g_XMSinCoefficients2.f[3]);
+
+ C1 = _mm_load_ps1(&g_XMCosCoefficients0.f[1]);
+ C2 = _mm_load_ps1(&g_XMCosCoefficients0.f[2]);
+ C3 = _mm_load_ps1(&g_XMCosCoefficients0.f[3]);
+ C4 = _mm_load_ps1(&g_XMCosCoefficients1.f[0]);
+ C5 = _mm_load_ps1(&g_XMCosCoefficients1.f[1]);
+ C6 = _mm_load_ps1(&g_XMCosCoefficients1.f[2]);
+ C7 = _mm_load_ps1(&g_XMCosCoefficients1.f[3]);
+ C8 = _mm_load_ps1(&g_XMCosCoefficients2.f[0]);
+ C9 = _mm_load_ps1(&g_XMCosCoefficients2.f[1]);
+ C10 = _mm_load_ps1(&g_XMCosCoefficients2.f[2]);
+ C11 = _mm_load_ps1(&g_XMCosCoefficients2.f[3]);
+
+ S1 = _mm_mul_ps(S1,V3);
+ Sin = _mm_add_ps(S1,V1);
+ Sin = XMVectorMultiplyAdd(S2, V5, Sin);
+ Sin = XMVectorMultiplyAdd(S3, V7, Sin);
+ Sin = XMVectorMultiplyAdd(S4, V9, Sin);
+ Sin = XMVectorMultiplyAdd(S5, V11, Sin);
+ Sin = XMVectorMultiplyAdd(S6, V13, Sin);
+ Sin = XMVectorMultiplyAdd(S7, V15, Sin);
+ Sin = XMVectorMultiplyAdd(S8, V17, Sin);
+ Sin = XMVectorMultiplyAdd(S9, V19, Sin);
+ Sin = XMVectorMultiplyAdd(S10, V21, Sin);
+ Sin = XMVectorMultiplyAdd(S11, V23, Sin);
+
+ Cos = _mm_mul_ps(C1,V2);
+ Cos = _mm_add_ps(Cos,g_XMOne);
+ Cos = XMVectorMultiplyAdd(C2, V4, Cos);
+ Cos = XMVectorMultiplyAdd(C3, V6, Cos);
+ Cos = XMVectorMultiplyAdd(C4, V8, Cos);
+ Cos = XMVectorMultiplyAdd(C5, V10, Cos);
+ Cos = XMVectorMultiplyAdd(C6, V12, Cos);
+ Cos = XMVectorMultiplyAdd(C7, V14, Cos);
+ Cos = XMVectorMultiplyAdd(C8, V16, Cos);
+ Cos = XMVectorMultiplyAdd(C9, V18, Cos);
+ Cos = XMVectorMultiplyAdd(C10, V20, Cos);
+ Cos = XMVectorMultiplyAdd(C11, V22, Cos);
+
+ *pSin = Sin;
+ *pCos = Cos;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorTan
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Cody and Waite algorithm to compute tangent.
+
+ XMVECTOR VA, VB, VC, VC2;
+ XMVECTOR T0, T1, T2, T3, T4, T5, T6, T7;
+ XMVECTOR C0, C1, TwoDivPi, Epsilon;
+ XMVECTOR N, D;
+ XMVECTOR R0, R1;
+ XMVECTOR VIsZero, VCNearZero, VBIsEven;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ UINT i;
+ static CONST XMVECTOR TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f};
+ static CONST XMVECTOR TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f};
+ static CONST XMVECTOR TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 2.0f / XM_PI};
+ static CONST XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1};
+
+ TwoDivPi = XMVectorSplatW(TanConstants);
+
+ Zero = XMVectorZero();
+
+ C0 = XMVectorSplatX(TanConstants);
+ C1 = XMVectorSplatY(TanConstants);
+ Epsilon = XMVectorSplatZ(TanConstants);
+
+ VA = XMVectorMultiply(V, TwoDivPi);
+
+ VA = XMVectorRound(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+ VB = XMVectorAbs(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+ for (i = 0; i < 4; i++)
+ {
+ VB.vector4_u32[i] = (UINT)VB.vector4_f32[i];
+ }
+
+ VC2 = XMVectorMultiply(VC, VC);
+
+ T7 = XMVectorSplatW(TanCoefficients1);
+ T6 = XMVectorSplatZ(TanCoefficients1);
+ T4 = XMVectorSplatX(TanCoefficients1);
+ T3 = XMVectorSplatW(TanCoefficients0);
+ T5 = XMVectorSplatY(TanCoefficients1);
+ T2 = XMVectorSplatZ(TanCoefficients0);
+ T1 = XMVectorSplatY(TanCoefficients0);
+ T0 = XMVectorSplatX(TanCoefficients0);
+
+ VBIsEven = XMVectorAndInt(VB, Mask.v);
+ VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+ N = XMVectorMultiplyAdd(VC2, T7, T6);
+ D = XMVectorMultiplyAdd(VC2, T4, T3);
+ N = XMVectorMultiplyAdd(VC2, N, T5);
+ D = XMVectorMultiplyAdd(VC2, D, T2);
+ N = XMVectorMultiply(VC2, N);
+ D = XMVectorMultiplyAdd(VC2, D, T1);
+ N = XMVectorMultiplyAdd(VC, N, VC);
+ VCNearZero = XMVectorInBounds(VC, Epsilon);
+ D = XMVectorMultiplyAdd(VC2, D, T0);
+
+ N = XMVectorSelect(N, VC, VCNearZero);
+ D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
+
+ R0 = XMVectorNegate(N);
+ R1 = XMVectorReciprocal(D);
+ R0 = XMVectorReciprocal(R0);
+ R1 = XMVectorMultiply(N, R1);
+ R0 = XMVectorMultiply(D, R0);
+
+ VIsZero = XMVectorEqual(V, Zero);
+
+ Result = XMVectorSelect(R0, R1, VBIsEven);
+
+ Result = XMVectorSelect(Result, Zero, VIsZero);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Cody and Waite algorithm to compute tangent.
+
+ XMVECTOR VA, VB, VC, VC2;
+ XMVECTOR T0, T1, T2, T3, T4, T5, T6, T7;
+ XMVECTOR C0, C1, TwoDivPi, Epsilon;
+ XMVECTOR N, D;
+ XMVECTOR R0, R1;
+ XMVECTOR VIsZero, VCNearZero, VBIsEven;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f};
+ static CONST XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f};
+ static CONST XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 2.0f / XM_PI};
+ static CONST XMVECTORI32 Mask = {0x1, 0x1, 0x1, 0x1};
+
+ TwoDivPi = XMVectorSplatW(TanConstants);
+
+ Zero = XMVectorZero();
+
+ C0 = XMVectorSplatX(TanConstants);
+ C1 = XMVectorSplatY(TanConstants);
+ Epsilon = XMVectorSplatZ(TanConstants);
+
+ VA = XMVectorMultiply(V, TwoDivPi);
+
+ VA = XMVectorRound(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+ VB = XMVectorAbs(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+ reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB);
+
+ VC2 = XMVectorMultiply(VC, VC);
+
+ T7 = XMVectorSplatW(TanCoefficients1);
+ T6 = XMVectorSplatZ(TanCoefficients1);
+ T4 = XMVectorSplatX(TanCoefficients1);
+ T3 = XMVectorSplatW(TanCoefficients0);
+ T5 = XMVectorSplatY(TanCoefficients1);
+ T2 = XMVectorSplatZ(TanCoefficients0);
+ T1 = XMVectorSplatY(TanCoefficients0);
+ T0 = XMVectorSplatX(TanCoefficients0);
+
+ VBIsEven = XMVectorAndInt(VB,Mask);
+ VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+ N = XMVectorMultiplyAdd(VC2, T7, T6);
+ D = XMVectorMultiplyAdd(VC2, T4, T3);
+ N = XMVectorMultiplyAdd(VC2, N, T5);
+ D = XMVectorMultiplyAdd(VC2, D, T2);
+ N = XMVectorMultiply(VC2, N);
+ D = XMVectorMultiplyAdd(VC2, D, T1);
+ N = XMVectorMultiplyAdd(VC, N, VC);
+ VCNearZero = XMVectorInBounds(VC, Epsilon);
+ D = XMVectorMultiplyAdd(VC2, D, T0);
+
+ N = XMVectorSelect(N, VC, VCNearZero);
+ D = XMVectorSelect(D, g_XMOne, VCNearZero);
+ R0 = XMVectorNegate(N);
+ R1 = _mm_div_ps(N,D);
+ R0 = _mm_div_ps(D,R0);
+ VIsZero = XMVectorEqual(V, Zero);
+ Result = XMVectorSelect(R0, R1, VBIsEven);
+ Result = XMVectorSelect(Result, Zero, VIsZero);
+
+ return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
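+
+// [Editorial note, not part of XNAMath] The Cody and Waite tangent above reduces the
+// argument by the nearest multiple of PI/2 (C0 + C1 is PI/2 split into two parts for
+// extra precision), evaluates a small rational approximation on the reduced value,
+// and returns tan(r) for even multiples or -1/tan(r) for odd ones. A hypothetical
+// scalar sketch of just the reduction and quadrant logic, with tanf standing in for
+// the rational approximation, kept under #if 0:
+#if 0
+inline float TanReducedScalar(float x)         // assumes roundf/fabsf/tanf from <math.h>
+{
+    const float c0 = 1.570796371f, c1 = 6.077100628e-11f;   // PI/2 split as in TanConstants
+    float n = roundf(x * (2.0f / 3.141592654f));
+    float r = (x - n * c0) - n * c1;                         // reduced argument near zero
+    int   odd = ((int)fabsf(n)) & 1;
+    return odd ? (-1.0f / tanf(r)) : tanf(r);
+}
+#endif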
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorSinH
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = XMVectorMultiplyAdd(V, Scale.v, g_XMNegativeOne.v);
+ V2 = XMVectorNegativeMultiplySubtract(V, Scale.v, g_XMNegativeOne.v);
+
+ E1 = XMVectorExp(V1);
+ E2 = XMVectorExp(V2);
+
+ Result = XMVectorSubtract(E1, E2);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = _mm_mul_ps(V, Scale);
+ V1 = _mm_add_ps(V1,g_XMNegativeOne);
+ V2 = _mm_mul_ps(V, Scale);
+ V2 = _mm_sub_ps(g_XMNegativeOne,V2);
+ E1 = XMVectorExp(V1);
+ E2 = XMVectorExp(V2);
+
+ Result = _mm_sub_ps(E1, E2);
+
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
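+
+// [Editorial note, not part of XNAMath] XMVectorSinH uses sinh(x) = (e^x - e^-x) / 2.
+// Since XMVectorExp is a base-2 exponential, the code rescales by 1/ln(2) and folds
+// the divide-by-two into the exponent as the -1 offset:
+// 2^(x/ln2 - 1) - 2^(-x/ln2 - 1) == (e^x - e^-x) / 2. A hypothetical scalar sketch,
+// kept under #if 0:
+#if 0
+inline float SinHScalar(float x)               // assumes exp2f from <math.h>
+{
+    const float scale = 1.442695040888963f;    // 1 / ln(2)
+    return exp2f(x * scale - 1.0f) - exp2f(-x * scale - 1.0f);
+}
+#endif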
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorCosH
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTOR Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = XMVectorMultiplyAdd(V, Scale, g_XMNegativeOne.v);
+ V2 = XMVectorNegativeMultiplySubtract(V, Scale, g_XMNegativeOne.v);
+
+ E1 = XMVectorExp(V1);
+ E2 = XMVectorExp(V2);
+
+ Result = XMVectorAdd(E1, E2);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = _mm_mul_ps(V,Scale);
+ V1 = _mm_add_ps(V1,g_XMNegativeOne);
+ V2 = _mm_mul_ps(V, Scale);
+ V2 = _mm_sub_ps(g_XMNegativeOne,V2);
+ E1 = XMVectorExp(V1);
+ E2 = XMVectorExp(V2);
+ Result = _mm_add_ps(E1, E2);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorTanH
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR E;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)
+
+ E = XMVectorMultiply(V, Scale.v);
+ E = XMVectorExp(E);
+ E = XMVectorMultiplyAdd(E, g_XMOneHalf.v, g_XMOneHalf.v);
+ E = XMVectorReciprocal(E);
+
+ Result = XMVectorSubtract(g_XMOne.v, E);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)
+
+ XMVECTOR E = _mm_mul_ps(V, Scale);
+ E = XMVectorExp(E);
+ E = _mm_mul_ps(E,g_XMOneHalf);
+ E = _mm_add_ps(E,g_XMOneHalf);
+ E = XMVectorReciprocal(E);
+ E = _mm_sub_ps(g_XMOne, E);
+ return E;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
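+
+// [Editorial note, not part of XNAMath] XMVectorTanH uses tanh(x) = 1 - 2/(e^(2x) + 1).
+// The Scale of 2/ln(2) turns the base-2 XMVectorExp into e^(2x), and the half/half
+// multiply-add builds (e^(2x) + 1)/2 before taking the reciprocal. A hypothetical
+// scalar sketch, kept under #if 0:
+#if 0
+inline float TanHScalar(float x)               // assumes exp2f from <math.h>
+{
+    float e2x = exp2f(x * 2.8853900817779268f);   // e^(2x) via 2^(2x/ln 2)
+    return 1.0f - 2.0f / (e2x + 1.0f);
+}
+#endif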
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorASin
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V2, V3, AbsV;
+ XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ XMVECTOR R0, R1, R2, R3, R4;
+ XMVECTOR OneMinusAbsV;
+ XMVECTOR Rsq;
+ XMVECTOR Result;
+ static CONST XMVECTOR OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f};
+
+ // asin(V) = V * (C0 + C1 * V + C2 * V^2 + C3 * V^3 + C4 * V^4 + C5 * V^5) + (1 - V) * rsq(1 - V) *
+ // V * (C6 + C7 * V + C8 * V^2 + C9 * V^3 + C10 * V^4 + C11 * V^5)
+
+ AbsV = XMVectorAbs(V);
+
+ V2 = XMVectorMultiply(V, V);
+ V3 = XMVectorMultiply(V2, AbsV);
+
+ R4 = XMVectorNegativeMultiplySubtract(AbsV, V, V);
+
+ OneMinusAbsV = XMVectorSubtract(OnePlusEpsilon, AbsV);
+ Rsq = XMVectorReciprocalSqrt(OneMinusAbsV);
+
+ C0 = XMVectorSplatX(g_XMASinCoefficients0.v);
+ C1 = XMVectorSplatY(g_XMASinCoefficients0.v);
+ C2 = XMVectorSplatZ(g_XMASinCoefficients0.v);
+ C3 = XMVectorSplatW(g_XMASinCoefficients0.v);
+
+ C4 = XMVectorSplatX(g_XMASinCoefficients1.v);
+ C5 = XMVectorSplatY(g_XMASinCoefficients1.v);
+ C6 = XMVectorSplatZ(g_XMASinCoefficients1.v);
+ C7 = XMVectorSplatW(g_XMASinCoefficients1.v);
+
+ C8 = XMVectorSplatX(g_XMASinCoefficients2.v);
+ C9 = XMVectorSplatY(g_XMASinCoefficients2.v);
+ C10 = XMVectorSplatZ(g_XMASinCoefficients2.v);
+ C11 = XMVectorSplatW(g_XMASinCoefficients2.v);
+
+ R0 = XMVectorMultiplyAdd(C3, AbsV, C7);
+ R1 = XMVectorMultiplyAdd(C1, AbsV, C5);
+ R2 = XMVectorMultiplyAdd(C2, AbsV, C6);
+ R3 = XMVectorMultiplyAdd(C0, AbsV, C4);
+
+ R0 = XMVectorMultiplyAdd(R0, AbsV, C11);
+ R1 = XMVectorMultiplyAdd(R1, AbsV, C9);
+ R2 = XMVectorMultiplyAdd(R2, AbsV, C10);
+ R3 = XMVectorMultiplyAdd(R3, AbsV, C8);
+
+ R0 = XMVectorMultiplyAdd(R2, V3, R0);
+ R1 = XMVectorMultiplyAdd(R3, V3, R1);
+
+ R0 = XMVectorMultiply(V, R0);
+ R1 = XMVectorMultiply(R4, R1);
+
+ Result = XMVectorMultiplyAdd(R1, Rsq, R0);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f};
+
+ // asin(V) = V * (C0 + C1 * V + C2 * V^2 + C3 * V^3 + C4 * V^4 + C5 * V^5) + (1 - V) * rsq(1 - V) *
+ // V * (C6 + C7 * V + C8 * V^2 + C9 * V^3 + C10 * V^4 + C11 * V^5)
+ // Get abs(V)
+ XMVECTOR vAbsV = _mm_setzero_ps();
+ vAbsV = _mm_sub_ps(vAbsV,V);
+ vAbsV = _mm_max_ps(vAbsV,V);
+
+ XMVECTOR R0 = vAbsV;
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[3]);
+ R0 = _mm_mul_ps(R0,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[3]);
+ R0 = _mm_add_ps(R0,vConstants);
+
+ XMVECTOR R1 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[1]);
+ R1 = _mm_mul_ps(R1,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[1]);
+ R1 = _mm_add_ps(R1, vConstants);
+
+ XMVECTOR R2 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[2]);
+ R2 = _mm_mul_ps(R2,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[2]);
+ R2 = _mm_add_ps(R2, vConstants);
+
+ XMVECTOR R3 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[0]);
+ R3 = _mm_mul_ps(R3,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[0]);
+ R3 = _mm_add_ps(R3, vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[3]);
+ R0 = _mm_mul_ps(R0,vAbsV);
+ R0 = _mm_add_ps(R0,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[1]);
+ R1 = _mm_mul_ps(R1,vAbsV);
+ R1 = _mm_add_ps(R1,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[2]);
+ R2 = _mm_mul_ps(R2,vAbsV);
+ R2 = _mm_add_ps(R2,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[0]);
+ R3 = _mm_mul_ps(R3,vAbsV);
+ R3 = _mm_add_ps(R3,vConstants);
+
+ // V3 = V^3
+ vConstants = _mm_mul_ps(V,V);
+ vConstants = _mm_mul_ps(vConstants, vAbsV);
+ // Mul by V^3
+ R2 = _mm_mul_ps(R2,vConstants);
+ R3 = _mm_mul_ps(R3,vConstants);
+ // Merge the results
+ R0 = _mm_add_ps(R0,R2);
+ R1 = _mm_add_ps(R1,R3);
+
+ R0 = _mm_mul_ps(R0,V);
+ // vConstants = V-(V^2 retaining sign)
+ vConstants = _mm_mul_ps(vAbsV, V);
+ vConstants = _mm_sub_ps(V,vConstants);
+ R1 = _mm_mul_ps(R1,vConstants);
+ vConstants = _mm_sub_ps(OnePlusEpsilon,vAbsV);
+ // Do NOT use rsqrt/mul. This needs the precision
+ vConstants = _mm_sqrt_ps(vConstants);
+ R1 = _mm_div_ps(R1,vConstants);
+ R0 = _mm_add_ps(R0,R1);
+ return R0;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorACos
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V2, V3, AbsV;
+ XMVECTOR C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ XMVECTOR R0, R1, R2, R3, R4;
+ XMVECTOR OneMinusAbsV;
+ XMVECTOR Rsq;
+ XMVECTOR Result;
+ static CONST XMVECTOR OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f};
+
+ // acos(V) = PI / 2 - asin(V)
+
+ AbsV = XMVectorAbs(V);
+
+ V2 = XMVectorMultiply(V, V);
+ V3 = XMVectorMultiply(V2, AbsV);
+
+ R4 = XMVectorNegativeMultiplySubtract(AbsV, V, V);
+
+ OneMinusAbsV = XMVectorSubtract(OnePlusEpsilon, AbsV);
+ Rsq = XMVectorReciprocalSqrt(OneMinusAbsV);
+
+ C0 = XMVectorSplatX(g_XMASinCoefficients0.v);
+ C1 = XMVectorSplatY(g_XMASinCoefficients0.v);
+ C2 = XMVectorSplatZ(g_XMASinCoefficients0.v);
+ C3 = XMVectorSplatW(g_XMASinCoefficients0.v);
+
+ C4 = XMVectorSplatX(g_XMASinCoefficients1.v);
+ C5 = XMVectorSplatY(g_XMASinCoefficients1.v);
+ C6 = XMVectorSplatZ(g_XMASinCoefficients1.v);
+ C7 = XMVectorSplatW(g_XMASinCoefficients1.v);
+
+ C8 = XMVectorSplatX(g_XMASinCoefficients2.v);
+ C9 = XMVectorSplatY(g_XMASinCoefficients2.v);
+ C10 = XMVectorSplatZ(g_XMASinCoefficients2.v);
+ C11 = XMVectorSplatW(g_XMASinCoefficients2.v);
+
+ R0 = XMVectorMultiplyAdd(C3, AbsV, C7);
+ R1 = XMVectorMultiplyAdd(C1, AbsV, C5);
+ R2 = XMVectorMultiplyAdd(C2, AbsV, C6);
+ R3 = XMVectorMultiplyAdd(C0, AbsV, C4);
+
+ R0 = XMVectorMultiplyAdd(R0, AbsV, C11);
+ R1 = XMVectorMultiplyAdd(R1, AbsV, C9);
+ R2 = XMVectorMultiplyAdd(R2, AbsV, C10);
+ R3 = XMVectorMultiplyAdd(R3, AbsV, C8);
+
+ R0 = XMVectorMultiplyAdd(R2, V3, R0);
+ R1 = XMVectorMultiplyAdd(R3, V3, R1);
+
+ R0 = XMVectorMultiply(V, R0);
+ R1 = XMVectorMultiply(R4, R1);
+
+ Result = XMVectorMultiplyAdd(R1, Rsq, R0);
+
+ Result = XMVectorSubtract(g_XMHalfPi.v, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 OnePlusEpsilon = {1.00000011921f, 1.00000011921f, 1.00000011921f, 1.00000011921f};
+ // Uses only 6 registers for good code on x86 targets
+ // acos(V) = PI / 2 - asin(V)
+ // Get abs(V)
+ XMVECTOR vAbsV = _mm_setzero_ps();
+ vAbsV = _mm_sub_ps(vAbsV,V);
+ vAbsV = _mm_max_ps(vAbsV,V);
+ // Perform the series in precision groups to
+ // retain precision across 20 bits. (3 bits of imprecision due to operations)
+ XMVECTOR R0 = vAbsV;
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[3]);
+ R0 = _mm_mul_ps(R0,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[3]);
+ R0 = _mm_add_ps(R0,vConstants);
+ R0 = _mm_mul_ps(R0,vAbsV);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[3]);
+ R0 = _mm_add_ps(R0,vConstants);
+
+ XMVECTOR R1 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[1]);
+ R1 = _mm_mul_ps(R1,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[1]);
+ R1 = _mm_add_ps(R1,vConstants);
+ R1 = _mm_mul_ps(R1, vAbsV);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[1]);
+ R1 = _mm_add_ps(R1,vConstants);
+
+ XMVECTOR R2 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[2]);
+ R2 = _mm_mul_ps(R2,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[2]);
+ R2 = _mm_add_ps(R2,vConstants);
+ R2 = _mm_mul_ps(R2, vAbsV);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[2]);
+ R2 = _mm_add_ps(R2,vConstants);
+
+ XMVECTOR R3 = vAbsV;
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients0.f[0]);
+ R3 = _mm_mul_ps(R3,vConstants);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients1.f[0]);
+ R3 = _mm_add_ps(R3,vConstants);
+ R3 = _mm_mul_ps(R3, vAbsV);
+ vConstants = _mm_load_ps1(&g_XMASinCoefficients2.f[0]);
+ R3 = _mm_add_ps(R3,vConstants);
+
+ // vConstants = V^3
+ vConstants = _mm_mul_ps(V,V);
+ vConstants = _mm_mul_ps(vConstants,vAbsV);
+ R2 = _mm_mul_ps(R2,vConstants);
+ R3 = _mm_mul_ps(R3,vConstants);
+ // Add the pair of values together here to retain
+ // as much precision as possible
+ R0 = _mm_add_ps(R0,R2);
+ R1 = _mm_add_ps(R1,R3);
+
+ R0 = _mm_mul_ps(R0,V);
+ // vConstants = V-(V*abs(V))
+ vConstants = _mm_mul_ps(V,vAbsV);
+ vConstants = _mm_sub_ps(V,vConstants);
+ R1 = _mm_mul_ps(R1,vConstants);
+    // Epsilon exists to allow 1.0 as an answer
+ vConstants = _mm_sub_ps(OnePlusEpsilon, vAbsV);
+ // Use sqrt instead of rsqrt for precision
+ vConstants = _mm_sqrt_ps(vConstants);
+ R1 = _mm_div_ps(R1,vConstants);
+ R1 = _mm_add_ps(R1,R0);
+ vConstants = _mm_sub_ps(g_XMHalfPi,R1);
+ return vConstants;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorATan
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Cody and Waite algorithm to compute inverse tangent.
+
+ XMVECTOR N, D;
+ XMVECTOR VF, G, ReciprocalF, AbsF, FA, FB;
+ XMVECTOR Sqrt3, Sqrt3MinusOne, TwoMinusSqrt3;
+ XMVECTOR HalfPi, OneThirdPi, OneSixthPi, Epsilon, MinV, MaxV;
+ XMVECTOR Zero;
+ XMVECTOR NegativeHalfPi;
+ XMVECTOR Angle1, Angle2;
+ XMVECTOR F_GT_One, F_GT_TwoMinusSqrt3, AbsF_LT_Epsilon, V_LT_Zero, V_GT_MaxV, V_LT_MinV;
+ XMVECTOR NegativeResult, Result;
+ XMVECTOR P0, P1, P2, P3, Q0, Q1, Q2, Q3;
+ static CONST XMVECTOR ATanConstants0 = {-1.3688768894e+1f, -2.0505855195e+1f, -8.4946240351f, -8.3758299368e-1f};
+ static CONST XMVECTOR ATanConstants1 = {4.1066306682e+1f, 8.6157349597e+1f, 5.9578436142e+1f, 1.5024001160e+1f};
+ static CONST XMVECTOR ATanConstants2 = {1.732050808f, 7.320508076e-1f, 2.679491924e-1f, 0.000244140625f}; // <sqrt(3), sqrt(3) - 1, 2 - sqrt(3), Epsilon>
+ static CONST XMVECTOR ATanConstants3 = {XM_PIDIV2, XM_PI / 3.0f, XM_PI / 6.0f, 8.507059173e+37f}; // <Pi / 2, Pi / 3, Pi / 6, MaxV>
+
+ Zero = XMVectorZero();
+
+ P0 = XMVectorSplatX(ATanConstants0);
+ P1 = XMVectorSplatY(ATanConstants0);
+ P2 = XMVectorSplatZ(ATanConstants0);
+ P3 = XMVectorSplatW(ATanConstants0);
+
+ Q0 = XMVectorSplatX(ATanConstants1);
+ Q1 = XMVectorSplatY(ATanConstants1);
+ Q2 = XMVectorSplatZ(ATanConstants1);
+ Q3 = XMVectorSplatW(ATanConstants1);
+
+ Sqrt3 = XMVectorSplatX(ATanConstants2);
+ Sqrt3MinusOne = XMVectorSplatY(ATanConstants2);
+ TwoMinusSqrt3 = XMVectorSplatZ(ATanConstants2);
+ Epsilon = XMVectorSplatW(ATanConstants2);
+
+ HalfPi = XMVectorSplatX(ATanConstants3);
+ OneThirdPi = XMVectorSplatY(ATanConstants3);
+ OneSixthPi = XMVectorSplatZ(ATanConstants3);
+ MaxV = XMVectorSplatW(ATanConstants3);
+
+ VF = XMVectorAbs(V);
+ ReciprocalF = XMVectorReciprocal(VF);
+
+ F_GT_One = XMVectorGreater(VF, g_XMOne.v);
+
+ VF = XMVectorSelect(VF, ReciprocalF, F_GT_One);
+ Angle1 = XMVectorSelect(Zero, HalfPi, F_GT_One);
+ Angle2 = XMVectorSelect(OneSixthPi, OneThirdPi, F_GT_One);
+
+ F_GT_TwoMinusSqrt3 = XMVectorGreater(VF, TwoMinusSqrt3);
+
+ FA = XMVectorMultiplyAdd(Sqrt3MinusOne, VF, VF);
+ FA = XMVectorAdd(FA, g_XMNegativeOne.v);
+ FB = XMVectorAdd(VF, Sqrt3);
+ FB = XMVectorReciprocal(FB);
+ FA = XMVectorMultiply(FA, FB);
+
+ VF = XMVectorSelect(VF, FA, F_GT_TwoMinusSqrt3);
+ Angle1 = XMVectorSelect(Angle1, Angle2, F_GT_TwoMinusSqrt3);
+
+ AbsF = XMVectorAbs(VF);
+ AbsF_LT_Epsilon = XMVectorLess(AbsF, Epsilon);
+
+ G = XMVectorMultiply(VF, VF);
+
+ D = XMVectorAdd(G, Q3);
+ D = XMVectorMultiplyAdd(D, G, Q2);
+ D = XMVectorMultiplyAdd(D, G, Q1);
+ D = XMVectorMultiplyAdd(D, G, Q0);
+ D = XMVectorReciprocal(D);
+
+ N = XMVectorMultiplyAdd(P3, G, P2);
+ N = XMVectorMultiplyAdd(N, G, P1);
+ N = XMVectorMultiplyAdd(N, G, P0);
+ N = XMVectorMultiply(N, G);
+ Result = XMVectorMultiply(N, D);
+
+ Result = XMVectorMultiplyAdd(Result, VF, VF);
+
+ Result = XMVectorSelect(Result, VF, AbsF_LT_Epsilon);
+
+ NegativeResult = XMVectorNegate(Result);
+ Result = XMVectorSelect(Result, NegativeResult, F_GT_One);
+
+ Result = XMVectorAdd(Result, Angle1);
+
+ V_LT_Zero = XMVectorLess(V, Zero);
+ NegativeResult = XMVectorNegate(Result);
+ Result = XMVectorSelect(Result, NegativeResult, V_LT_Zero);
+
+ MinV = XMVectorNegate(MaxV);
+ NegativeHalfPi = XMVectorNegate(HalfPi);
+ V_GT_MaxV = XMVectorGreater(V, MaxV);
+ V_LT_MinV = XMVectorLess(V, MinV);
+ Result = XMVectorSelect(Result, g_XMHalfPi.v, V_GT_MaxV);
+ Result = XMVectorSelect(Result, NegativeHalfPi, V_LT_MinV);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 ATanConstants0 = {-1.3688768894e+1f, -2.0505855195e+1f, -8.4946240351f, -8.3758299368e-1f};
+ static CONST XMVECTORF32 ATanConstants1 = {4.1066306682e+1f, 8.6157349597e+1f, 5.9578436142e+1f, 1.5024001160e+1f};
+ static CONST XMVECTORF32 ATanConstants2 = {1.732050808f, 7.320508076e-1f, 2.679491924e-1f, 0.000244140625f}; // <sqrt(3), sqrt(3) - 1, 2 - sqrt(3), Epsilon>
+ static CONST XMVECTORF32 ATanConstants3 = {XM_PIDIV2, XM_PI / 3.0f, XM_PI / 6.0f, 8.507059173e+37f}; // <Pi / 2, Pi / 3, Pi / 6, MaxV>
+
+ XMVECTOR VF = XMVectorAbs(V);
+ XMVECTOR F_GT_One = _mm_cmpgt_ps(VF,g_XMOne);
+ XMVECTOR ReciprocalF = XMVectorReciprocal(VF);
+ VF = XMVectorSelect(VF, ReciprocalF, F_GT_One);
+ XMVECTOR Zero = XMVectorZero();
+ XMVECTOR HalfPi = _mm_load_ps1(&ATanConstants3.f[0]);
+ XMVECTOR Angle1 = XMVectorSelect(Zero, HalfPi, F_GT_One);
+ // Pi/3
+ XMVECTOR vConstants = _mm_load_ps1(&ATanConstants3.f[1]);
+ // Pi/6
+ XMVECTOR Angle2 = _mm_load_ps1(&ATanConstants3.f[2]);
+ Angle2 = XMVectorSelect(Angle2, vConstants, F_GT_One);
+
+ // 1-sqrt(3)
+ XMVECTOR FA = _mm_load_ps1(&ATanConstants2.f[1]);
+ FA = _mm_mul_ps(FA,VF);
+ FA = _mm_add_ps(FA,VF);
+ FA = _mm_add_ps(FA,g_XMNegativeOne);
+ // sqrt(3)
+ vConstants = _mm_load_ps1(&ATanConstants2.f[0]);
+ vConstants = _mm_add_ps(vConstants,VF);
+ FA = _mm_div_ps(FA,vConstants);
+
+ // 2-sqrt(3)
+ vConstants = _mm_load_ps1(&ATanConstants2.f[2]);
+ // >2-sqrt(3)?
+ vConstants = _mm_cmpgt_ps(VF,vConstants);
+ VF = XMVectorSelect(VF, FA, vConstants);
+ Angle1 = XMVectorSelect(Angle1, Angle2, vConstants);
+
+ XMVECTOR AbsF = XMVectorAbs(VF);
+
+ XMVECTOR G = _mm_mul_ps(VF,VF);
+ XMVECTOR D = _mm_load_ps1(&ATanConstants1.f[3]);
+ D = _mm_add_ps(D,G);
+ D = _mm_mul_ps(D,G);
+ vConstants = _mm_load_ps1(&ATanConstants1.f[2]);
+ D = _mm_add_ps(D,vConstants);
+ D = _mm_mul_ps(D,G);
+ vConstants = _mm_load_ps1(&ATanConstants1.f[1]);
+ D = _mm_add_ps(D,vConstants);
+ D = _mm_mul_ps(D,G);
+ vConstants = _mm_load_ps1(&ATanConstants1.f[0]);
+ D = _mm_add_ps(D,vConstants);
+
+ XMVECTOR N = _mm_load_ps1(&ATanConstants0.f[3]);
+ N = _mm_mul_ps(N,G);
+ vConstants = _mm_load_ps1(&ATanConstants0.f[2]);
+ N = _mm_add_ps(N,vConstants);
+ N = _mm_mul_ps(N,G);
+ vConstants = _mm_load_ps1(&ATanConstants0.f[1]);
+ N = _mm_add_ps(N,vConstants);
+ N = _mm_mul_ps(N,G);
+ vConstants = _mm_load_ps1(&ATanConstants0.f[0]);
+ N = _mm_add_ps(N,vConstants);
+ N = _mm_mul_ps(N,G);
+ XMVECTOR Result = _mm_div_ps(N,D);
+
+ Result = _mm_mul_ps(Result,VF);
+ Result = _mm_add_ps(Result,VF);
+ // Epsilon
+ vConstants = _mm_load_ps1(&ATanConstants2.f[3]);
+ vConstants = _mm_cmpge_ps(vConstants,AbsF);
+ Result = XMVectorSelect(Result,VF,vConstants);
+
+ XMVECTOR NegativeResult = _mm_mul_ps(Result,g_XMNegativeOne);
+ Result = XMVectorSelect(Result,NegativeResult,F_GT_One);
+ Result = _mm_add_ps(Result,Angle1);
+
+ Zero = _mm_cmpge_ps(Zero,V);
+ NegativeResult = _mm_mul_ps(Result,g_XMNegativeOne);
+ Result = XMVectorSelect(Result,NegativeResult,Zero);
+
+ XMVECTOR MaxV = _mm_load_ps1(&ATanConstants3.f[3]);
+ XMVECTOR MinV = _mm_mul_ps(MaxV,g_XMNegativeOne);
+ // Negate HalfPi
+ HalfPi = _mm_mul_ps(HalfPi,g_XMNegativeOne);
+ MaxV = _mm_cmple_ps(MaxV,V);
+ MinV = _mm_cmpge_ps(MinV,V);
+ Result = XMVectorSelect(Result,g_XMHalfPi,MaxV);
+ // HalfPi = -HalfPi
+ Result = XMVectorSelect(Result,HalfPi,MinV);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
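+
+// [Editorial note, not part of XNAMath] The Cody and Waite arctangent above first maps
+// |x| > 1 onto its reciprocal (atan(x) = PI/2 - atan(1/x)) and, when the value still
+// exceeds 2 - sqrt(3), applies atan(x) = PI/6 + atan((sqrt(3)*x - 1)/(x + sqrt(3))),
+// so the rational approximation only ever sees small arguments. A hypothetical scalar
+// sketch of the range reduction, with atanf standing in for the approximation, kept
+// under #if 0:
+#if 0
+inline float ATanReducedScalar(float x)        // assumes fabsf/atanf from <math.h>
+{
+    const float pi = 3.141592654f, sqrt3 = 1.732050808f;
+    float ax = fabsf(x);
+    float offset = 0.0f;
+    if (ax > 1.0f) { ax = 1.0f / ax; offset = pi / 2.0f; }   // atan(x) = PI/2 - atan(1/x)
+    float r = (ax > 2.0f - sqrt3)
+        ? pi / 6.0f + atanf((sqrt3 * ax - 1.0f) / (ax + sqrt3))
+        : atanf(ax);
+    if (offset != 0.0f) r = offset - r;
+    return (x < 0.0f) ? -r : r;
+}
+#endif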
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVectorATan2
+(
+ FXMVECTOR Y,
+ FXMVECTOR X
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
+
+ // Y == 0 and X is Negative -> Pi with the sign of Y
+    // Y == 0 and X is Positive -> 0 with the sign of Y
+ // Y != 0 and X == 0 -> Pi / 2 with the sign of Y
+ // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y)
+ // X == -Infinity and Finite Y -> Pi with the sign of Y
+ // X == +Infinity and Finite Y -> 0 with the sign of Y
+ // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y
+ // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
+ // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
+
+ XMVECTOR Reciprocal;
+ XMVECTOR V;
+ XMVECTOR YSign;
+ XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
+ XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
+ XMVECTOR ATanResultValid;
+ XMVECTOR R0, R1, R2, R3, R4, R5;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
+ Zero = XMVectorZero();
+ ATanResultValid = XMVectorTrueInt();
+
+ Pi = XMVectorSplatX(ATan2Constants);
+ PiOverTwo = XMVectorSplatY(ATan2Constants);
+ PiOverFour = XMVectorSplatZ(ATan2Constants);
+ ThreePiOverFour = XMVectorSplatW(ATan2Constants);
+
+ YEqualsZero = XMVectorEqual(Y, Zero);
+ XEqualsZero = XMVectorEqual(X, Zero);
+ XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
+ XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
+ YEqualsInfinity = XMVectorIsInfinite(Y);
+ XEqualsInfinity = XMVectorIsInfinite(X);
+
+ YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
+ Pi = XMVectorOrInt(Pi, YSign);
+ PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
+ PiOverFour = XMVectorOrInt(PiOverFour, YSign);
+ ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
+
+ R1 = XMVectorSelect(Pi, YSign, XIsPositive);
+ R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
+ R3 = XMVectorSelect(R2, R1, YEqualsZero);
+ R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
+ R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
+ Result = XMVectorSelect(R3, R5, YEqualsInfinity);
+ ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
+
+ Reciprocal = XMVectorReciprocal(X);
+ V = XMVectorMultiply(Y, Reciprocal);
+ R0 = XMVectorATan(V);
+
+ R1 = XMVectorSelect( Pi, Zero, XIsPositive );
+ R2 = XMVectorAdd(R0, R1);
+
+ Result = XMVectorSelect(Result, R2, ATanResultValid);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
+    // Mask of the components where Y is infinite
+ XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
+ // Get the sign of (Y&0x80000000)
+ XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
+ // Get the sign bits of X
+ XMVECTOR XIsPositive = _mm_and_ps(X,g_XMNegativeZero);
+ // Change them to masks
+ XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
+ // Get Pi
+ XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
+ // Copy the sign of Y
+ Pi = _mm_or_ps(Pi,YSign);
+ XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
+ // Mask for X==0
+ XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
+    // Get Pi/2 with the sign of Y
+ XMVECTOR PiOverTwo = _mm_load_ps1(&ATan2Constants.f[1]);
+ PiOverTwo = _mm_or_ps(PiOverTwo,YSign);
+ XMVECTOR R2 = XMVectorSelect(g_XMNegOneMask,PiOverTwo,vConstants);
+ // Mask for Y==0
+ vConstants = _mm_cmpeq_ps(Y,g_XMZero);
+ R2 = XMVectorSelect(R2,R1,vConstants);
+ // Get Pi/4 with sign of Y
+ XMVECTOR PiOverFour = _mm_load_ps1(&ATan2Constants.f[2]);
+ PiOverFour = _mm_or_ps(PiOverFour,YSign);
+ // Get (Pi*3)/4 with sign of Y
+ XMVECTOR ThreePiOverFour = _mm_load_ps1(&ATan2Constants.f[3]);
+ ThreePiOverFour = _mm_or_ps(ThreePiOverFour,YSign);
+ vConstants = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
+ XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
+ vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);
+
+ XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
+ vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
+    // At this point, any entry that is still 0xFFFFFFFF will get the result
+    // from XMVectorATan(); otherwise, keep the precalculated value
+ vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
+    // Any entries not equal to 0xFFFFFFFF are considered precalculated
+ XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask);
+ // Let's do the ATan2 function
+ vConstants = _mm_div_ps(Y,X);
+ vConstants = XMVectorATan(vConstants);
+ // Discard entries that have been declared void
+
+ XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
+ vConstants = _mm_add_ps( vConstants, R3 );
+
+ vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
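+
+// [Editorial note, not part of XNAMath] In the general finite, non-zero case the
+// routine above computes atan(Y/X) and then adds PI with the sign of Y when X is
+// negative; the table in the comment covers the zero and infinity cases separately.
+// A hypothetical scalar sketch of the general path only, kept under #if 0:
+#if 0
+inline float ATan2GeneralScalar(float y, float x)   // assumes atanf from <math.h>
+{
+    const float pi = 3.141592654f;
+    float r = atanf(y / x);
+    if (x < 0.0f)
+        r += (y < 0.0f) ? -pi : pi;            // PI with the sign of Y
+    return r;
+}
+#endif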
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSinEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V2, V3, V5, V7;
+ XMVECTOR S1, S2, S3;
+ XMVECTOR Result;
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+ V2 = XMVectorMultiply(V, V);
+ V3 = XMVectorMultiply(V2, V);
+ V5 = XMVectorMultiply(V3, V2);
+ V7 = XMVectorMultiply(V5, V2);
+
+ S1 = XMVectorSplatY(g_XMSinEstCoefficients.v);
+ S2 = XMVectorSplatZ(g_XMSinEstCoefficients.v);
+ S3 = XMVectorSplatW(g_XMSinEstCoefficients.v);
+
+ Result = XMVectorMultiplyAdd(S1, V3, V);
+ Result = XMVectorMultiplyAdd(S2, V5, Result);
+ Result = XMVectorMultiplyAdd(S3, V7, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+ XMVECTOR V2 = _mm_mul_ps(V,V);
+ XMVECTOR V3 = _mm_mul_ps(V2,V);
+ XMVECTOR vResult = _mm_load_ps1(&g_XMSinEstCoefficients.f[1]);
+ vResult = _mm_mul_ps(vResult,V3);
+ vResult = _mm_add_ps(vResult,V);
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMSinEstCoefficients.f[2]);
+ // V^5
+ V3 = _mm_mul_ps(V3,V2);
+ vConstants = _mm_mul_ps(vConstants,V3);
+ vResult = _mm_add_ps(vResult,vConstants);
+ vConstants = _mm_load_ps1(&g_XMSinEstCoefficients.f[3]);
+ // V^7
+ V3 = _mm_mul_ps(V3,V2);
+ vConstants = _mm_mul_ps(vConstants,V3);
+ vResult = _mm_add_ps(vResult,vConstants);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorCosEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V2, V4, V6;
+ XMVECTOR C0, C1, C2, C3;
+ XMVECTOR Result;
+
+ V2 = XMVectorMultiply(V, V);
+ V4 = XMVectorMultiply(V2, V2);
+ V6 = XMVectorMultiply(V4, V2);
+
+ C0 = XMVectorSplatX(g_XMCosEstCoefficients.v);
+ C1 = XMVectorSplatY(g_XMCosEstCoefficients.v);
+ C2 = XMVectorSplatZ(g_XMCosEstCoefficients.v);
+ C3 = XMVectorSplatW(g_XMCosEstCoefficients.v);
+
+ Result = XMVectorMultiplyAdd(C1, V2, C0);
+ Result = XMVectorMultiplyAdd(C2, V4, Result);
+ Result = XMVectorMultiplyAdd(C3, V6, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get V^2
+ XMVECTOR V2 = _mm_mul_ps(V,V);
+ XMVECTOR vResult = _mm_load_ps1(&g_XMCosEstCoefficients.f[1]);
+ vResult = _mm_mul_ps(vResult,V2);
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[0]);
+ vResult = _mm_add_ps(vResult,vConstants);
+ vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[2]);
+ // Get V^4
+ XMVECTOR V4 = _mm_mul_ps(V2, V2);
+ vConstants = _mm_mul_ps(vConstants,V4);
+ vResult = _mm_add_ps(vResult,vConstants);
+ vConstants = _mm_load_ps1(&g_XMCosEstCoefficients.f[3]);
+ // It's really V^6
+ V4 = _mm_mul_ps(V4,V2);
+ vConstants = _mm_mul_ps(vConstants,V4);
+ vResult = _mm_add_ps(vResult,vConstants);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMVectorSinCosEst
+(
+ XMVECTOR* pSin,
+ XMVECTOR* pCos,
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V2, V3, V4, V5, V6, V7;
+ XMVECTOR S1, S2, S3;
+ XMVECTOR C0, C1, C2, C3;
+ XMVECTOR Sin, Cos;
+
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
+ V2 = XMVectorMultiply(V, V);
+ V3 = XMVectorMultiply(V2, V);
+ V4 = XMVectorMultiply(V2, V2);
+ V5 = XMVectorMultiply(V3, V2);
+ V6 = XMVectorMultiply(V3, V3);
+ V7 = XMVectorMultiply(V4, V3);
+
+ S1 = XMVectorSplatY(g_XMSinEstCoefficients.v);
+ S2 = XMVectorSplatZ(g_XMSinEstCoefficients.v);
+ S3 = XMVectorSplatW(g_XMSinEstCoefficients.v);
+
+ C0 = XMVectorSplatX(g_XMCosEstCoefficients.v);
+ C1 = XMVectorSplatY(g_XMCosEstCoefficients.v);
+ C2 = XMVectorSplatZ(g_XMCosEstCoefficients.v);
+ C3 = XMVectorSplatW(g_XMCosEstCoefficients.v);
+
+ Sin = XMVectorMultiplyAdd(S1, V3, V);
+ Sin = XMVectorMultiplyAdd(S2, V5, Sin);
+ Sin = XMVectorMultiplyAdd(S3, V7, Sin);
+
+ Cos = XMVectorMultiplyAdd(C1, V2, C0);
+ Cos = XMVectorMultiplyAdd(C2, V4, Cos);
+ Cos = XMVectorMultiplyAdd(C3, V6, Cos);
+
+ *pSin = Sin;
+ *pCos = Cos;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pSin);
+ XMASSERT(pCos);
+ XMVECTOR V2, V3, V4, V5, V6, V7;
+ XMVECTOR S1, S2, S3;
+ XMVECTOR C0, C1, C2, C3;
+ XMVECTOR Sin, Cos;
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! (for -PI <= V < PI)
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! (for -PI <= V < PI)
+ V2 = XMVectorMultiply(V, V);
+ V3 = XMVectorMultiply(V2, V);
+ V4 = XMVectorMultiply(V2, V2);
+ V5 = XMVectorMultiply(V3, V2);
+ V6 = XMVectorMultiply(V3, V3);
+ V7 = XMVectorMultiply(V4, V3);
+
+ S1 = _mm_load_ps1(&g_XMSinEstCoefficients.f[1]);
+ S2 = _mm_load_ps1(&g_XMSinEstCoefficients.f[2]);
+ S3 = _mm_load_ps1(&g_XMSinEstCoefficients.f[3]);
+
+ C0 = _mm_load_ps1(&g_XMCosEstCoefficients.f[0]);
+ C1 = _mm_load_ps1(&g_XMCosEstCoefficients.f[1]);
+ C2 = _mm_load_ps1(&g_XMCosEstCoefficients.f[2]);
+ C3 = _mm_load_ps1(&g_XMCosEstCoefficients.f[3]);
+
+ Sin = XMVectorMultiplyAdd(S1, V3, V);
+ Sin = XMVectorMultiplyAdd(S2, V5, Sin);
+ Sin = XMVectorMultiplyAdd(S3, V7, Sin);
+
+ Cos = XMVectorMultiplyAdd(C1, V2, C0);
+ Cos = XMVectorMultiplyAdd(C2, V4, Cos);
+ Cos = XMVectorMultiplyAdd(C3, V6, Cos);
+
+ *pSin = Sin;
+ *pCos = Cos;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorTanEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2, V1T0, V1T1, V2T2;
+ XMVECTOR T0, T1, T2;
+ XMVECTOR N, D;
+ XMVECTOR OneOverPi;
+ XMVECTOR Result;
+
+ OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v);
+
+ V1 = XMVectorMultiply(V, OneOverPi);
+ V1 = XMVectorRound(V1);
+
+ V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V);
+
+ T0 = XMVectorSplatX(g_XMTanEstCoefficients.v);
+ T1 = XMVectorSplatY(g_XMTanEstCoefficients.v);
+ T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v);
+
+ V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2);
+ V2 = XMVectorMultiply(V1, V1);
+ V1T0 = XMVectorMultiply(V1, T0);
+ V1T1 = XMVectorMultiply(V1, T1);
+
+ D = XMVectorReciprocalEst(V2T2);
+ N = XMVectorMultiplyAdd(V2, V1T1, V1T0);
+
+ Result = XMVectorMultiply(N, D);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1, V2, V1T0, V1T1, V2T2;
+ XMVECTOR T0, T1, T2;
+ XMVECTOR N, D;
+ XMVECTOR OneOverPi;
+ XMVECTOR Result;
+
+ OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients);
+
+ V1 = XMVectorMultiply(V, OneOverPi);
+ V1 = XMVectorRound(V1);
+
+ V1 = XMVectorNegativeMultiplySubtract(g_XMPi, V1, V);
+
+ T0 = XMVectorSplatX(g_XMTanEstCoefficients);
+ T1 = XMVectorSplatY(g_XMTanEstCoefficients);
+ T2 = XMVectorSplatZ(g_XMTanEstCoefficients);
+
+ V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2);
+ V2 = XMVectorMultiply(V1, V1);
+ V1T0 = XMVectorMultiply(V1, T0);
+ V1T1 = XMVectorMultiply(V1, T1);
+
+ D = XMVectorReciprocalEst(V2T2);
+ N = XMVectorMultiplyAdd(V2, V1T1, V1T0);
+
+ Result = XMVectorMultiply(N, D);
+
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorSinHEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = XMVectorMultiplyAdd(V, Scale.v, g_XMNegativeOne.v);
+ V2 = XMVectorNegativeMultiplySubtract(V, Scale.v, g_XMNegativeOne.v);
+
+ E1 = XMVectorExpEst(V1);
+ E2 = XMVectorExpEst(V2);
+
+ Result = XMVectorSubtract(E1, E2);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = _mm_mul_ps(V,Scale);
+ V1 = _mm_add_ps(V1,g_XMNegativeOne);
+ V2 = _mm_mul_ps(V,Scale);
+ V2 = _mm_sub_ps(g_XMNegativeOne,V2);
+ E1 = XMVectorExpEst(V1);
+ E2 = XMVectorExpEst(V2);
+ Result = _mm_sub_ps(E1, E2);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
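+
+// Illustrative note (not part of the upstream source): with Scale = 1/ln(2)
+// and XMVectorExpEst computing a base-2 exponential, the two terms above are
+//   E1 = 2^( V*Scale - 1) = e^V  / 2
+//   E2 = 2^(-V*Scale - 1) = e^-V / 2
+// so E1 - E2 = (e^V - e^-V) / 2 = sinh(V); the "- 1" in each exponent folds
+// the final division by two into the exponential.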
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorCosHEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTOR Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = XMVectorMultiplyAdd(V, Scale, g_XMNegativeOne.v);
+ V2 = XMVectorNegativeMultiplySubtract(V, Scale, g_XMNegativeOne.v);
+
+ E1 = XMVectorExpEst(V1);
+ E2 = XMVectorExpEst(V2);
+
+ Result = XMVectorAdd(E1, E2);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1, V2;
+ XMVECTOR E1, E2;
+ XMVECTOR Result;
+ static CONST XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)
+
+ V1 = _mm_mul_ps(V,Scale);
+ V1 = _mm_add_ps(V1,g_XMNegativeOne);
+ V2 = _mm_mul_ps(V, Scale);
+ V2 = _mm_sub_ps(g_XMNegativeOne,V2);
+ E1 = XMVectorExpEst(V1);
+ E2 = XMVectorExpEst(V2);
+ Result = _mm_add_ps(E1, E2);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorTanHEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR E;
+ XMVECTOR Result;
+ static CONST XMVECTOR Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)
+
+ E = XMVectorMultiply(V, Scale);
+ E = XMVectorExpEst(E);
+ E = XMVectorMultiplyAdd(E, g_XMOneHalf.v, g_XMOneHalf.v);
+ E = XMVectorReciprocalEst(E);
+
+ Result = XMVectorSubtract(g_XMOne.v, E);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)
+
+ XMVECTOR E = _mm_mul_ps(V, Scale);
+ E = XMVectorExpEst(E);
+ E = _mm_mul_ps(E,g_XMOneHalf);
+ E = _mm_add_ps(E,g_XMOneHalf);
+ E = XMVectorReciprocalEst(E);
+ E = _mm_sub_ps(g_XMOne, E);
+ return E;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
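+
+// Illustrative note (not part of the upstream source): with Scale = 2/ln(2)
+// and a base-2 XMVectorExpEst, E starts out as e^(2V), so the steps above
+// compute
+//   1 - 1/((e^(2V) + 1)/2) = 1 - 2/(e^(2V) + 1) = (e^(2V) - 1)/(e^(2V) + 1) = tanh(V)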
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorASinEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR AbsV, V2, VD, VC0, V2C3;
+ XMVECTOR C0, C1, C2, C3;
+ XMVECTOR D, Rsq, SqrtD;
+ XMVECTOR OnePlusEps;
+ XMVECTOR Result;
+
+ AbsV = XMVectorAbs(V);
+
+ OnePlusEps = XMVectorSplatX(g_XMASinEstConstants.v);
+
+ C0 = XMVectorSplatX(g_XMASinEstCoefficients.v);
+ C1 = XMVectorSplatY(g_XMASinEstCoefficients.v);
+ C2 = XMVectorSplatZ(g_XMASinEstCoefficients.v);
+ C3 = XMVectorSplatW(g_XMASinEstCoefficients.v);
+
+ D = XMVectorSubtract(OnePlusEps, AbsV);
+
+ Rsq = XMVectorReciprocalSqrtEst(D);
+ SqrtD = XMVectorMultiply(D, Rsq);
+
+ V2 = XMVectorMultiply(V, AbsV);
+ V2C3 = XMVectorMultiply(V2, C3);
+ VD = XMVectorMultiply(D, AbsV);
+ VC0 = XMVectorMultiply(V, C0);
+
+ Result = XMVectorMultiply(V, C1);
+ Result = XMVectorMultiplyAdd(V2, C2, Result);
+ Result = XMVectorMultiplyAdd(V2C3, VD, Result);
+ Result = XMVectorMultiplyAdd(VC0, SqrtD, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get abs(V)
+ XMVECTOR vAbsV = _mm_setzero_ps();
+ vAbsV = _mm_sub_ps(vAbsV,V);
+ vAbsV = _mm_max_ps(vAbsV,V);
+
+ XMVECTOR D = _mm_load_ps1(&g_XMASinEstConstants.f[0]);
+ D = _mm_sub_ps(D,vAbsV);
+ // Since this is an estimate, rqsrt is okay
+ XMVECTOR vConstants = _mm_rsqrt_ps(D);
+ XMVECTOR SqrtD = _mm_mul_ps(D,vConstants);
+ // V2 = V^2 retaining sign
+ XMVECTOR V2 = _mm_mul_ps(V,vAbsV);
+ D = _mm_mul_ps(D,vAbsV);
+
+ XMVECTOR vResult = _mm_load_ps1(&g_XMASinEstCoefficients.f[1]);
+ vResult = _mm_mul_ps(vResult,V);
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[2]);
+ vConstants = _mm_mul_ps(vConstants,V2);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[3]);
+ vConstants = _mm_mul_ps(vConstants,V2);
+ vConstants = _mm_mul_ps(vConstants,D);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[0]);
+ vConstants = _mm_mul_ps(vConstants,V);
+ vConstants = _mm_mul_ps(vConstants,SqrtD);
+ vResult = _mm_add_ps(vResult,vConstants);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorACosEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR AbsV, V2, VD, VC0, V2C3;
+ XMVECTOR C0, C1, C2, C3;
+ XMVECTOR D, Rsq, SqrtD;
+ XMVECTOR OnePlusEps, HalfPi;
+ XMVECTOR Result;
+
+ // acos(V) = PI / 2 - asin(V)
+
+ AbsV = XMVectorAbs(V);
+
+ OnePlusEps = XMVectorSplatX(g_XMASinEstConstants.v);
+ HalfPi = XMVectorSplatY(g_XMASinEstConstants.v);
+
+ C0 = XMVectorSplatX(g_XMASinEstCoefficients.v);
+ C1 = XMVectorSplatY(g_XMASinEstCoefficients.v);
+ C2 = XMVectorSplatZ(g_XMASinEstCoefficients.v);
+ C3 = XMVectorSplatW(g_XMASinEstCoefficients.v);
+
+ D = XMVectorSubtract(OnePlusEps, AbsV);
+
+ Rsq = XMVectorReciprocalSqrtEst(D);
+ SqrtD = XMVectorMultiply(D, Rsq);
+
+ V2 = XMVectorMultiply(V, AbsV);
+ V2C3 = XMVectorMultiply(V2, C3);
+ VD = XMVectorMultiply(D, AbsV);
+ VC0 = XMVectorMultiply(V, C0);
+
+ Result = XMVectorMultiply(V, C1);
+ Result = XMVectorMultiplyAdd(V2, C2, Result);
+ Result = XMVectorMultiplyAdd(V2C3, VD, Result);
+ Result = XMVectorMultiplyAdd(VC0, SqrtD, Result);
+ Result = XMVectorSubtract(HalfPi, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // acos(V) = PI / 2 - asin(V)
+ // Get abs(V)
+ XMVECTOR vAbsV = _mm_setzero_ps();
+ vAbsV = _mm_sub_ps(vAbsV,V);
+ vAbsV = _mm_max_ps(vAbsV,V);
+ // Calc D
+ XMVECTOR D = _mm_load_ps1(&g_XMASinEstConstants.f[0]);
+ D = _mm_sub_ps(D,vAbsV);
+ // SqrtD = sqrt(D-abs(V)) estimated
+ XMVECTOR vConstants = _mm_rsqrt_ps(D);
+ XMVECTOR SqrtD = _mm_mul_ps(D,vConstants);
+ // V2 = V^2 while retaining sign
+ XMVECTOR V2 = _mm_mul_ps(V, vAbsV);
+ // Drop vAbsV here. D = (Const-abs(V))*abs(V)
+ D = _mm_mul_ps(D, vAbsV);
+
+ XMVECTOR vResult = _mm_load_ps1(&g_XMASinEstCoefficients.f[1]);
+ vResult = _mm_mul_ps(vResult,V);
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[2]);
+ vConstants = _mm_mul_ps(vConstants,V2);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[3]);
+ vConstants = _mm_mul_ps(vConstants,V2);
+ vConstants = _mm_mul_ps(vConstants,D);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinEstCoefficients.f[0]);
+ vConstants = _mm_mul_ps(vConstants,V);
+ vConstants = _mm_mul_ps(vConstants,SqrtD);
+ vResult = _mm_add_ps(vResult,vConstants);
+
+ vConstants = _mm_load_ps1(&g_XMASinEstConstants.f[1]);
+ vResult = _mm_sub_ps(vConstants,vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorATanEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR AbsV, V2S2, N, D;
+ XMVECTOR S0, S1, S2;
+ XMVECTOR HalfPi;
+ XMVECTOR Result;
+
+ S0 = XMVectorSplatX(g_XMATanEstCoefficients.v);
+ S1 = XMVectorSplatY(g_XMATanEstCoefficients.v);
+ S2 = XMVectorSplatZ(g_XMATanEstCoefficients.v);
+ HalfPi = XMVectorSplatW(g_XMATanEstCoefficients.v);
+
+ AbsV = XMVectorAbs(V);
+
+ V2S2 = XMVectorMultiplyAdd(V, V, S2);
+ N = XMVectorMultiplyAdd(AbsV, HalfPi, S0);
+ D = XMVectorMultiplyAdd(AbsV, S1, V2S2);
+ N = XMVectorMultiply(N, V);
+ D = XMVectorReciprocalEst(D);
+
+ Result = XMVectorMultiply(N, D);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get abs(V)
+ XMVECTOR vAbsV = _mm_setzero_ps();
+ vAbsV = _mm_sub_ps(vAbsV,V);
+ vAbsV = _mm_max_ps(vAbsV,V);
+
+ XMVECTOR vResult = _mm_load_ps1(&g_XMATanEstCoefficients.f[3]);
+ vResult = _mm_mul_ps(vResult,vAbsV);
+ XMVECTOR vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[0]);
+ vResult = _mm_add_ps(vResult,vConstants);
+ vResult = _mm_mul_ps(vResult,V);
+
+ XMVECTOR D = _mm_mul_ps(V,V);
+ vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[2]);
+ D = _mm_add_ps(D,vConstants);
+ vConstants = _mm_load_ps1(&g_XMATanEstCoefficients.f[1]);
+ vConstants = _mm_mul_ps(vConstants,vAbsV);
+ D = _mm_add_ps(D,vConstants);
+ vResult = _mm_div_ps(vResult,D);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorATan2Est
+(
+ FXMVECTOR Y,
+ FXMVECTOR X
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Reciprocal;
+ XMVECTOR V;
+ XMVECTOR YSign;
+ XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour;
+ XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity;
+ XMVECTOR ATanResultValid;
+ XMVECTOR R0, R1, R2, R3, R4, R5;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
+ Zero = XMVectorZero();
+ ATanResultValid = XMVectorTrueInt();
+
+ Pi = XMVectorSplatX(ATan2Constants);
+ PiOverTwo = XMVectorSplatY(ATan2Constants);
+ PiOverFour = XMVectorSplatZ(ATan2Constants);
+ ThreePiOverFour = XMVectorSplatW(ATan2Constants);
+
+ YEqualsZero = XMVectorEqual(Y, Zero);
+ XEqualsZero = XMVectorEqual(X, Zero);
+ XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
+ XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
+ YEqualsInfinity = XMVectorIsInfinite(Y);
+ XEqualsInfinity = XMVectorIsInfinite(X);
+
+ YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
+ Pi = XMVectorOrInt(Pi, YSign);
+ PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
+ PiOverFour = XMVectorOrInt(PiOverFour, YSign);
+ ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
+
+ R1 = XMVectorSelect(Pi, YSign, XIsPositive);
+ R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
+ R3 = XMVectorSelect(R2, R1, YEqualsZero);
+ R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
+ R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
+ Result = XMVectorSelect(R3, R5, YEqualsInfinity);
+ ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
+
+ Reciprocal = XMVectorReciprocalEst(X);
+ V = XMVectorMultiply(Y, Reciprocal);
+ R0 = XMVectorATanEst(V);
+
+ R1 = XMVectorSelect( Pi, Zero, XIsPositive );
+ R2 = XMVectorAdd(R0, R1);
+
+ Result = XMVectorSelect(Result, R2, ATanResultValid);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
+
+ // Mask if Y>0 && Y!=INF
+ XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
+ // Get the sign of (Y&0x80000000)
+ XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero);
+ // Get the sign bits of X
+ XMVECTOR XIsPositive = _mm_and_ps(X,g_XMNegativeZero);
+ // Change them to masks
+ XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero);
+ // Get Pi
+ XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]);
+ // Copy the sign of Y
+ Pi = _mm_or_ps(Pi,YSign);
+ XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive);
+ // Mask for X==0
+ XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero);
+    // Get Pi/2 with the sign of Y
+ XMVECTOR PiOverTwo = _mm_load_ps1(&ATan2Constants.f[1]);
+ PiOverTwo = _mm_or_ps(PiOverTwo,YSign);
+ XMVECTOR R2 = XMVectorSelect(g_XMNegOneMask,PiOverTwo,vConstants);
+ // Mask for Y==0
+ vConstants = _mm_cmpeq_ps(Y,g_XMZero);
+ R2 = XMVectorSelect(R2,R1,vConstants);
+ // Get Pi/4 with sign of Y
+ XMVECTOR PiOverFour = _mm_load_ps1(&ATan2Constants.f[2]);
+ PiOverFour = _mm_or_ps(PiOverFour,YSign);
+ // Get (Pi*3)/4 with sign of Y
+ XMVECTOR ThreePiOverFour = _mm_load_ps1(&ATan2Constants.f[3]);
+ ThreePiOverFour = _mm_or_ps(ThreePiOverFour,YSign);
+ vConstants = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
+ XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
+ vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity);
+
+ XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity);
+ vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity);
+    // At this point, any entry still holding the 0xFFFFFFFF sentinel will get
+    // its result from XMVectorATanEst(); the others keep the precomputed value
+ vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity);
+ // Any entries not 0xFFFFFFFF, are considered precalculated
+ XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask);
+ // Let's do the ATan2 function
+ XMVECTOR Reciprocal = _mm_rcp_ps(X);
+ vConstants = _mm_mul_ps(Y, Reciprocal);
+ vConstants = XMVectorATanEst(vConstants);
+ // Discard entries that have been declared void
+
+ XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive );
+ vConstants = _mm_add_ps( vConstants, R3 );
+
+ vResult = XMVectorSelect(vResult,vConstants,ATanResultValid);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
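+
+// Illustrative note (not part of the upstream source): the selects above
+// implement the usual atan2 special-case table before falling back to
+// XMVectorATanEst(Y/X). Per component, with the result copying the sign of Y:
+//   Y == 0          -> 0 when X is positive, PI when X is negative
+//   X == 0, Y != 0  -> PI/2
+//   Y infinite      -> PI/2; PI/4 if X is also +infinity, 3*PI/4 if -infinity
+//   otherwise       -> ATanEst(Y/X), plus PI when X is negative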
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorLerp
+(
+ FXMVECTOR V0,
+ FXMVECTOR V1,
+ FLOAT t
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Scale;
+ XMVECTOR Length;
+ XMVECTOR Result;
+
+ // V0 + t * (V1 - V0)
+ Scale = XMVectorReplicate(t);
+ Length = XMVectorSubtract(V1, V0);
+ Result = XMVectorMultiplyAdd(Length, Scale, V0);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR L, S;
+ XMVECTOR Result;
+
+ L = _mm_sub_ps( V1, V0 );
+
+ S = _mm_set_ps1( t );
+
+ Result = _mm_mul_ps( L, S );
+
+ return _mm_add_ps( Result, V0 );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorLerpV
+(
+ FXMVECTOR V0,
+ FXMVECTOR V1,
+ FXMVECTOR T
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Length;
+ XMVECTOR Result;
+
+ // V0 + T * (V1 - V0)
+ Length = XMVectorSubtract(V1, V0);
+ Result = XMVectorMultiplyAdd(Length, T, V0);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Length;
+ XMVECTOR Result;
+
+ Length = _mm_sub_ps( V1, V0 );
+
+ Result = _mm_mul_ps( Length, T );
+
+ return _mm_add_ps( Result, V0 );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorHermite
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Tangent0,
+ FXMVECTOR Position1,
+ CXMVECTOR Tangent1,
+ FLOAT t
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR P0;
+ XMVECTOR T0;
+ XMVECTOR P1;
+ XMVECTOR T1;
+ XMVECTOR Result;
+ FLOAT t2;
+ FLOAT t3;
+
+ // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
+ // (t^3 - 2 * t^2 + t) * Tangent0 +
+ // (-2 * t^3 + 3 * t^2) * Position1 +
+ // (t^3 - t^2) * Tangent1
+ t2 = t * t;
+ t3 = t * t2;
+
+ P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
+ T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
+ P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
+ T1 = XMVectorReplicate(t3 - t2);
+
+ Result = XMVectorMultiply(P0, Position0);
+ Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
+ Result = XMVectorMultiplyAdd(P1, Position1, Result);
+ Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ FLOAT t2 = t * t;
+ FLOAT t3 = t * t2;
+
+ XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
+ XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
+ XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
+ XMVECTOR T1 = _mm_set_ps1(t3 - t2);
+
+ XMVECTOR vResult = _mm_mul_ps(P0, Position0);
+ XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vTemp = _mm_mul_ps(P1, Position1);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vTemp = _mm_mul_ps(T1, Tangent1);
+ vResult = _mm_add_ps(vResult,vTemp);
+ return vResult;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
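+
+// Illustrative note (not part of the upstream source): a quick check of the
+// Hermite basis used above. At t = 0 the four weights are (1, 0, 0, 0) and at
+// t = 1 they are (0, 0, 1, 0), so the curve interpolates Position0 and
+// Position1; their derivatives are (0, 1, 0, 0) at t = 0 and (0, 0, 0, 1) at
+// t = 1, so the end tangents match Tangent0 and Tangent1.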
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorHermiteV
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Tangent0,
+ FXMVECTOR Position1,
+ CXMVECTOR Tangent1,
+ CXMVECTOR T
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR P0;
+ XMVECTOR T0;
+ XMVECTOR P1;
+ XMVECTOR T1;
+ XMVECTOR Result;
+ XMVECTOR T2;
+ XMVECTOR T3;
+
+ // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
+ // (t^3 - 2 * t^2 + t) * Tangent0 +
+ // (-2 * t^3 + 3 * t^2) * Position1 +
+ // (t^3 - t^2) * Tangent1
+ T2 = XMVectorMultiply(T, T);
+ T3 = XMVectorMultiply(T , T2);
+
+ P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f);
+ T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
+ P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
+ T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);
+
+ Result = XMVectorMultiply(P0, Position0);
+ Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
+ Result = XMVectorMultiplyAdd(P1, Position1, Result);
+ Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
+ static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};
+
+ // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
+ // (t^3 - 2 * t^2 + t) * Tangent0 +
+ // (-2 * t^3 + 3 * t^2) * Position1 +
+ // (t^3 - t^2) * Tangent1
+ XMVECTOR T2 = _mm_mul_ps(T,T);
+ XMVECTOR T3 = _mm_mul_ps(T,T2);
+ // Mul by the constants against t^2
+ T2 = _mm_mul_ps(T2,CatMulT2);
+ // Mul by the constants against t^3
+ T3 = _mm_mul_ps(T3,CatMulT3);
+ // T3 now has the pre-result.
+ T3 = _mm_add_ps(T3,T2);
+ // I need to add t.y only
+ T2 = _mm_and_ps(T,g_XMMaskY);
+ T3 = _mm_add_ps(T3,T2);
+ // Add 1.0f to x
+ T3 = _mm_add_ps(T3,g_XMIdentityR0);
+ // Now, I have the constants created
+ // Mul the x constant to Position0
+ XMVECTOR vResult = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,Position0);
+ // Mul the y constant to Tangent0
+ T2 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(1,1,1,1));
+ T2 = _mm_mul_ps(T2,Tangent0);
+ vResult = _mm_add_ps(vResult,T2);
+ // Mul the z constant to Position1
+ T2 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(2,2,2,2));
+ T2 = _mm_mul_ps(T2,Position1);
+ vResult = _mm_add_ps(vResult,T2);
+ // Mul the w constant to Tangent1
+ T3 = _mm_shuffle_ps(T3,T3,_MM_SHUFFLE(3,3,3,3));
+ T3 = _mm_mul_ps(T3,Tangent1);
+ vResult = _mm_add_ps(vResult,T3);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorCatmullRom
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Position1,
+ FXMVECTOR Position2,
+ CXMVECTOR Position3,
+ FLOAT t
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR P0;
+ XMVECTOR P1;
+ XMVECTOR P2;
+ XMVECTOR P3;
+ XMVECTOR Result;
+ FLOAT t2;
+ FLOAT t3;
+
+ // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
+ // (3 * t^3 - 5 * t^2 + 2) * Position1 +
+ // (-3 * t^3 + 4 * t^2 + t) * Position2 +
+ // (t^3 - t^2) * Position3) * 0.5
+ t2 = t * t;
+ t3 = t * t2;
+
+ P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
+ P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
+ P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
+ P3 = XMVectorReplicate((t3 - t2) * 0.5f);
+
+ Result = XMVectorMultiply(P0, Position0);
+ Result = XMVectorMultiplyAdd(P1, Position1, Result);
+ Result = XMVectorMultiplyAdd(P2, Position2, Result);
+ Result = XMVectorMultiplyAdd(P3, Position3, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ FLOAT t2 = t * t;
+ FLOAT t3 = t * t2;
+
+ XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
+ XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
+ XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
+ XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);
+
+ P0 = _mm_mul_ps(P0, Position0);
+ P1 = _mm_mul_ps(P1, Position1);
+ P2 = _mm_mul_ps(P2, Position2);
+ P3 = _mm_mul_ps(P3, Position3);
+ P0 = _mm_add_ps(P0,P1);
+ P2 = _mm_add_ps(P2,P3);
+ P0 = _mm_add_ps(P0,P2);
+ return P0;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
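+
+// Illustrative note (not part of the upstream source): at t = 0 the four
+// Catmull-Rom weights evaluate to (0, 1, 0, 0) and at t = 1 to (0, 0, 1, 0),
+// so the segment runs from Position1 to Position2; Position0 and Position3
+// only shape the tangents at the two ends.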
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorCatmullRomV
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Position1,
+ FXMVECTOR Position2,
+ CXMVECTOR Position3,
+ CXMVECTOR T
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ float fx = T.vector4_f32[0];
+ float fy = T.vector4_f32[1];
+ float fz = T.vector4_f32[2];
+ float fw = T.vector4_f32[3];
+ XMVECTOR vResult = {
+ 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+
+ (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+
+ (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+
+ (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]),
+ 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+
+ (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+
+ (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+
+ (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]),
+ 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+
+ (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+
+ (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+
+ (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]),
+ 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+
+ (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+
+ (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+
+ (fw*fw*fw-fw*fw)*Position3.vector4_f32[3])
+ };
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
+ static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
+ static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
+ static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
+ // Cache T^2 and T^3
+ XMVECTOR T2 = _mm_mul_ps(T,T);
+ XMVECTOR T3 = _mm_mul_ps(T,T2);
+ // Perform the Position0 term
+ XMVECTOR vResult = _mm_add_ps(T2,T2);
+ vResult = _mm_sub_ps(vResult,T);
+ vResult = _mm_sub_ps(vResult,T3);
+ vResult = _mm_mul_ps(vResult,Position0);
+ // Perform the Position1 term and add
+ XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3);
+ XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5);
+ vTemp = _mm_sub_ps(vTemp,vTemp2);
+ vTemp = _mm_add_ps(vTemp,Catmul2);
+ vTemp = _mm_mul_ps(vTemp,Position1);
+ vResult = _mm_add_ps(vResult,vTemp);
+ // Perform the Position2 term and add
+ vTemp = _mm_mul_ps(T2,Catmul4);
+ vTemp2 = _mm_mul_ps(T3,Catmul3);
+ vTemp = _mm_sub_ps(vTemp,vTemp2);
+ vTemp = _mm_add_ps(vTemp,T);
+ vTemp = _mm_mul_ps(vTemp,Position2);
+ vResult = _mm_add_ps(vResult,vTemp);
+ // Position3 is the last term
+ T3 = _mm_sub_ps(T3,T2);
+ T3 = _mm_mul_ps(T3,Position3);
+ vResult = _mm_add_ps(vResult,T3);
+ // Multiply by 0.5f and exit
+ vResult = _mm_mul_ps(vResult,g_XMOneHalf);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorBaryCentric
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Position1,
+ FXMVECTOR Position2,
+ FLOAT f,
+ FLOAT g
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
+ XMVECTOR P10;
+ XMVECTOR P20;
+ XMVECTOR ScaleF;
+ XMVECTOR ScaleG;
+ XMVECTOR Result;
+
+ P10 = XMVectorSubtract(Position1, Position0);
+ ScaleF = XMVectorReplicate(f);
+
+ P20 = XMVectorSubtract(Position2, Position0);
+ ScaleG = XMVectorReplicate(g);
+
+ Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
+ Result = XMVectorMultiplyAdd(P20, ScaleG, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
+ XMVECTOR SF = _mm_set_ps1(f);
+ XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
+ XMVECTOR SG = _mm_set_ps1(g);
+ R1 = _mm_mul_ps(R1,SF);
+ R2 = _mm_mul_ps(R2,SG);
+ R1 = _mm_add_ps(R1,Position0);
+ R1 = _mm_add_ps(R1,R2);
+ return R1;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
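+
+// Illustrative note (not part of the upstream source): with the weights used
+// above, (f, g) = (0, 0) returns Position0, (1, 0) returns Position1 and
+// (0, 1) returns Position2; any pair with f >= 0, g >= 0 and f + g <= 1 lands
+// inside the triangle spanned by the three positions.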
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVectorBaryCentricV
+(
+ FXMVECTOR Position0,
+ FXMVECTOR Position1,
+ FXMVECTOR Position2,
+ CXMVECTOR F,
+ CXMVECTOR G
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
+ XMVECTOR P10;
+ XMVECTOR P20;
+ XMVECTOR Result;
+
+ P10 = XMVectorSubtract(Position1, Position0);
+ P20 = XMVectorSubtract(Position2, Position0);
+
+ Result = XMVectorMultiplyAdd(P10, F, Position0);
+ Result = XMVectorMultiplyAdd(P20, G, Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
+ XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
+ R1 = _mm_mul_ps(R1,F);
+ R2 = _mm_mul_ps(R2,G);
+ R1 = _mm_add_ps(R1,Position0);
+ R1 = _mm_add_ps(R1,R2);
+ return R1;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * 2D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2Equal
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+// z and w are don't care
+ return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2EqualR(V1, V2));
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector2EqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+
+ if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] == V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] != V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+// z and w are don't care
+ int iTest = _mm_movemask_ps(vTemp)&3;
+ UINT CR = 0;
+ if (iTest==3)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2EqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return (((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector2EqualIntR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+ if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
+ (V1.vector4_u32[1] == V2.vector4_u32[1]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
+ (V1.vector4_u32[1] != V2.vector4_u32[1]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ int iTest = _mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&3;
+ UINT CR = 0;
+ if (iTest==3)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2NearEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT dx, dy;
+ dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+ dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+ return ((dx <= Epsilon.vector4_f32[0]) &&
+ (dy <= Epsilon.vector4_f32[1]));
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get the difference
+ XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+ // Get the absolute value of the difference
+ XMVECTOR vTemp = _mm_setzero_ps();
+ vTemp = _mm_sub_ps(vTemp,vDelta);
+ vTemp = _mm_max_ps(vTemp,vDelta);
+ vTemp = _mm_cmple_ps(vTemp,Epsilon);
+ // z and w are don't care
+ return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2NotEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+// z and w are don't care
+ return (((_mm_movemask_ps(vTemp)&3)!=3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAnyFalse(XMVector2EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2NotEqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return (((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&3)!=3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2Greater
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+// z and w are don't care
+ return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2GreaterR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector2GreaterR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] > V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] <= V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp)&3;
+ UINT CR = 0;
+ if (iTest==3)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2GreaterOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector2GreaterOrEqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] >= V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] < V2.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp)&3;
+ UINT CR = 0;
+ if (iTest == 3)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2Less
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2GreaterR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2LessOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2InBounds
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+ #if defined(_XM_NO_INTRINSICS_)
+ return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
+ #elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // x and y in bounds? (z and w are don't care)
+ return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector2InBoundsR
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]))
+ {
+ CR = XM_CRMASK_CR6BOUNDS;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // x and y in bounds? (z and w are don't care)
+ return ((_mm_movemask_ps(vTemp1)&0x3)==0x3) ? XM_CRMASK_CR6BOUNDS : 0;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2IsNaN
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (XMISNAN(V.vector4_f32[0]) ||
+ XMISNAN(V.vector4_f32[1]));
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the exponent
+ __m128i vTempInf = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMInfinity);
+ // Mask off the mantissa
+ __m128i vTempNan = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMQNaNTest);
+ // Are any of the exponents == 0x7F800000?
+ vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity);
+ // Are any of the mantissa's zero? (SSE2 doesn't have a neq test)
+ vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero);
+ // Perform a not on the NaN test to be true on NON-zero mantissas
+ vTempNan = _mm_andnot_si128(vTempNan,vTempInf);
+ // If x or y are NaN, the signs are true after the merge above
+ return ((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTempNan)[0])&3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector2IsInfinite
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return (XMISINF(V.vector4_f32[0]) ||
+ XMISINF(V.vector4_f32[1]));
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the sign bit
+ __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
+ // Compare to infinity
+ vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
+    // If x or y are infinity, the signs are true.
+ return ((_mm_movemask_ps(vTemp)&3) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Dot
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] =
+ Result.vector4_f32[1] =
+ Result.vector4_f32[2] =
+ Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Cross
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
+ XMVECTOR vResult = {
+ fCross,
+ fCross,
+ fCross,
+ fCross
+ };
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Swap x and y
+ XMVECTOR vResult = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(0,1,0,1));
+ // Perform the muls
+ vResult = _mm_mul_ps(vResult,V1);
+ // Splat y
+ XMVECTOR vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
+ // Sub the values
+ vResult = _mm_sub_ss(vResult,vTemp);
+ // Splat the cross product
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,0,0,0));
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
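+
+// Illustrative note (not part of the upstream source): the splatted scalar is
+// V1.x*V2.y - V1.y*V2.x, i.e. the z component of the 3D cross product and
+// twice the signed area of the triangle (0, V1, V2). For example, (1,0) and
+// (0,1) yield +1 in every lane, and swapping the arguments flips the sign.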
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2LengthSq
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return XMVector2Dot(V, V);
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ return vLengthSq;
+#else
+ return XMVector2Dot(V, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2ReciprocalLengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector2LengthSq(V);
+ Result = XMVectorReciprocalSqrtEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_rsqrt_ss(vLengthSq);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2ReciprocalLength
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector2LengthSq(V);
+ Result = XMVectorReciprocalSqrt(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_sqrt_ss(vLengthSq);
+ vLengthSq = _mm_div_ss(g_XMOne,vLengthSq);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2LengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result;
+ Result = XMVector2LengthSq(V);
+ Result = XMVectorSqrtEst(Result);
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_sqrt_ss(vLengthSq);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Length
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result = XMVector2LengthSq(V);
+ Result = XMVectorSqrt(Result);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// XMVector2NormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+XMFINLINE XMVECTOR XMVector2NormalizeEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result = XMVector2ReciprocalLength(V);
+ Result = XMVectorMultiply(V, Result);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has y splatted
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ // x+y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_rsqrt_ss(vLengthSq);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ vLengthSq = _mm_mul_ps(vLengthSq,V);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Normalize
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fLength;
+ XMVECTOR vResult;
+
+ vResult = XMVector2Length( V );
+ fLength = vResult.vector4_f32[0];
+
+ // Prevent divide by zero
+ if (fLength > 0) {
+ fLength = 1.0f/fLength;
+ }
+
+ vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+ vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+ vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+ vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x and y only
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,1,1,1));
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Prepare for the division
+ XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+ // Create zero with a single instruction
+ XMVECTOR vZeroMask = _mm_setzero_ps();
+ // Test for a divide by zero (Must be FP to detect -0.0)
+ vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+ // Failsafe on zero (Or epsilon) length planes
+ // If the length is infinity, set the elements to zero
+ vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+ // Reciprocal mul to perform the normalization
+ vResult = _mm_div_ps(V,vResult);
+ // Any that are infinity, set to zero
+ vResult = _mm_and_ps(vResult,vZeroMask);
+ // Select qnan or result based on infinite length
+ XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+ XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+ vResult = _mm_or_ps(vTemp1,vTemp2);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2ClampLength
+(
+ FXMVECTOR V,
+ FLOAT LengthMin,
+ FLOAT LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampMax;
+ XMVECTOR ClampMin;
+
+ ClampMax = XMVectorReplicate(LengthMax);
+ ClampMin = XMVectorReplicate(LengthMin);
+
+ return XMVector2ClampLengthV(V, ClampMin, ClampMax);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampMax = _mm_set_ps1(LengthMax);
+ XMVECTOR ClampMin = _mm_set_ps1(LengthMin);
+ return XMVector2ClampLengthV(V, ClampMin, ClampMax);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2ClampLengthV
+(
+ FXMVECTOR V,
+ FXMVECTOR LengthMin,
+ FXMVECTOR LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR Zero;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0]));
+ XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0]));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMin, XMVectorZero()));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMax, XMVectorZero()));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMax, LengthMin));
+
+ LengthSq = XMVector2LengthSq(V);
+
+ Zero = XMVectorZero();
+
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
+ ZeroLength = XMVectorEqual(LengthSq, Zero);
+
+ Length = XMVectorMultiply(LengthSq, RcpLength);
+
+ Normal = XMVectorMultiply(V, RcpLength);
+
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+
+ Result = XMVectorMultiply(Normal, ClampLength);
+
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax, ControlMin);
+ Result = XMVectorSelect(Result, V, Control);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
+ XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
+ XMASSERT(XMVector2GreaterOrEqual(LengthMax, LengthMin));
+ LengthSq = XMVector2LengthSq(V);
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity);
+ ZeroLength = XMVectorEqual(LengthSq, g_XMZero);
+ Length = _mm_mul_ps(LengthSq, RcpLength);
+ Normal = _mm_mul_ps(V, RcpLength);
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+ Result = _mm_mul_ps(Normal, ClampLength);
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax, ControlMin);
+ Result = XMVectorSelect(Result, V, Control);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Reflect
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ Result = XMVector2Dot(Incident, Normal);
+ Result = XMVectorAdd(Result, Result);
+ Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ XMVECTOR Result = XMVector2Dot(Incident,Normal);
+ Result = _mm_add_ps(Result, Result);
+ Result = _mm_mul_ps(Result, Normal);
+ Result = _mm_sub_ps(Incident,Result);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
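+
+// Illustrative note (not part of the upstream source): Normal is assumed to
+// be unit length. Example: Incident = (1, -1), Normal = (0, 1) gives
+// dot = -1, so Result = (1, -1) - 2*(-1)*(0, 1) = (1, 1).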
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Refract
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FLOAT RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Index;
+ Index = XMVectorReplicate(RefractionIndex);
+ return XMVector2RefractV(Incident, Normal, Index);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Index = _mm_set_ps1(RefractionIndex);
+ return XMVector2RefractV(Incident,Normal,Index);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Return the refraction of a 2D vector
+XMFINLINE XMVECTOR XMVector2RefractV
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FXMVECTOR RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ float IDotN;
+ float RX,RY;
+ XMVECTOR vResult;
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+ IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]);
+ // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ RY = 1.0f-(IDotN*IDotN);
+ RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]);
+ RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]);
+ if (RX>=0.0f) {
+ RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX)));
+ } else {
+ RX = 0.0f;
+ }
+ if (RY>=0.0f) {
+ RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY)));
+ } else {
+ RY = 0.0f;
+ }
+ vResult.vector4_f32[0] = RX;
+ vResult.vector4_f32[1] = RY;
+ vResult.vector4_f32[2] = 0.0f;
+ vResult.vector4_f32[3] = 0.0f;
+ return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+ // Get the 2D Dot product of Incident-Normal
+ XMVECTOR IDotN = _mm_mul_ps(Incident,Normal);
+ XMVECTOR vTemp = _mm_shuffle_ps(IDotN,IDotN,_MM_SHUFFLE(1,1,1,1));
+ IDotN = _mm_add_ss(IDotN,vTemp);
+ IDotN = _mm_shuffle_ps(IDotN,IDotN,_MM_SHUFFLE(0,0,0,0));
+ // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ vTemp = _mm_mul_ps(IDotN,IDotN);
+ vTemp = _mm_sub_ps(g_XMOne,vTemp);
+ vTemp = _mm_mul_ps(vTemp,RefractionIndex);
+ vTemp = _mm_mul_ps(vTemp,RefractionIndex);
+ vTemp = _mm_sub_ps(g_XMOne,vTemp);
+ // If any terms are <=0, sqrt() will fail, punt to zero
+ XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero);
+ // R = RefractionIndex * IDotN + sqrt(R)
+ vTemp = _mm_sqrt_ps(vTemp);
+ XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN);
+ vTemp = _mm_add_ps(vTemp,vResult);
+ // Result = RefractionIndex * Incident - Normal * R
+ vResult = _mm_mul_ps(RefractionIndex,Incident);
+ vTemp = _mm_mul_ps(vTemp,Normal);
+ vResult = _mm_sub_ps(vResult,vTemp);
+ vResult = _mm_and_ps(vResult,vMask);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
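+
+// Illustrative note (not part of the upstream source): Incident and Normal
+// are assumed to be unit length and RefractionIndex holds the ratio of
+// refraction indices (n1/n2) per component. Components whose discriminant
+// 1 - n^2*(1 - dot^2) goes negative correspond to total internal reflection
+// and are forced to zero; with n = 1 and dot(Incident, Normal) <= 0 the ray
+// passes through unchanged.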
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Orthogonal
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = -V.vector4_f32[1];
+ Result.vector4_f32[1] = V.vector4_f32[0];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,2,0,1));
+ vResult = _mm_mul_ps(vResult,g_XMNegateX);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2AngleBetweenNormalsEst
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ Result = XMVector2Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACosEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector2Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACosEst(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2AngleBetweenNormals
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ Result = XMVector2Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACos(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector2Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACos(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2AngleBetweenVectors
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ L1 = XMVector2ReciprocalLength(V1);
+ L2 = XMVector2ReciprocalLength(V2);
+
+ Dot = XMVector2Dot(V1, V2);
+
+ L1 = XMVectorMultiply(L1, L2);
+
+ CosAngle = XMVectorMultiply(Dot, L1);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ CosAngle = XMVectorClamp(CosAngle, NegativeOne, One);
+
+ Result = XMVectorACos(CosAngle);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR Result;
+ L1 = XMVector2ReciprocalLength(V1);
+ L2 = XMVector2ReciprocalLength(V2);
+ Dot = XMVector2Dot(V1, V2);
+ L1 = _mm_mul_ps(L1, L2);
+ CosAngle = _mm_mul_ps(Dot, L1);
+ CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne,g_XMOne);
+ Result = XMVectorACos(CosAngle);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2LinePointDistance
+(
+ FXMVECTOR LinePoint1,
+ FXMVECTOR LinePoint2,
+ FXMVECTOR Point
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR PointVector;
+ XMVECTOR LineVector;
+ XMVECTOR ReciprocalLengthSq;
+ XMVECTOR PointProjectionScale;
+ XMVECTOR DistanceVector;
+ XMVECTOR Result;
+
+ // Given a vector PointVector from LinePoint1 to Point and a vector
+ // LineVector from LinePoint1 to LinePoint2, the scaled distance
+ // PointProjectionScale from LinePoint1 to the perpendicular projection
+ // of PointVector onto the line is defined as:
+ //
+ // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
+
+ PointVector = XMVectorSubtract(Point, LinePoint1);
+ LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
+
+ ReciprocalLengthSq = XMVector2LengthSq(LineVector);
+ ReciprocalLengthSq = XMVectorReciprocal(ReciprocalLengthSq);
+
+ PointProjectionScale = XMVector2Dot(PointVector, LineVector);
+ PointProjectionScale = XMVectorMultiply(PointProjectionScale, ReciprocalLengthSq);
+
+ DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
+ DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
+
+ Result = XMVector2Length(DistanceVector);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
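+    // Same projection as the scalar path: scale = dot(PointVector, LineVector) / LengthSq(LineVector).
+    // Despite its name, ReciprocalLengthSq holds the squared length here; the divide below replaces
+    // the reciprocal multiply used in the no-intrinsics path.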
+ XMVECTOR PointVector = _mm_sub_ps(Point,LinePoint1);
+ XMVECTOR LineVector = _mm_sub_ps(LinePoint2,LinePoint1);
+ XMVECTOR ReciprocalLengthSq = XMVector2LengthSq(LineVector);
+ XMVECTOR vResult = XMVector2Dot(PointVector,LineVector);
+ vResult = _mm_div_ps(vResult,ReciprocalLengthSq);
+ vResult = _mm_mul_ps(vResult,LineVector);
+ vResult = _mm_sub_ps(PointVector,vResult);
+ vResult = XMVector2Length(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2IntersectLine
+(
+ FXMVECTOR Line1Point1,
+ FXMVECTOR Line1Point2,
+ FXMVECTOR Line2Point1,
+ CXMVECTOR Line2Point2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V1;
+ XMVECTOR V2;
+ XMVECTOR V3;
+ XMVECTOR C1;
+ XMVECTOR C2;
+ XMVECTOR Result;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ V1 = XMVectorSubtract(Line1Point2, Line1Point1);
+ V2 = XMVectorSubtract(Line2Point2, Line2Point1);
+ V3 = XMVectorSubtract(Line1Point1, Line2Point1);
+
+ C1 = XMVector2Cross(V1, V2);
+ C2 = XMVector2Cross(V2, V3);
+
+ if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
+ {
+ if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
+ {
+ // Coincident
+ Result = g_XMInfinity.v;
+ }
+ else
+ {
+ // Parallel
+ Result = g_XMQNaN.v;
+ }
+ }
+ else
+ {
+ // Intersection point = Line1Point1 + V1 * (C2 / C1)
+ XMVECTOR Scale;
+ Scale = XMVectorReciprocal(C1);
+ Scale = XMVectorMultiply(C2, Scale);
+ Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
+ }
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
+ XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
+ XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
+ // Generate the cross products
+ XMVECTOR C1 = XMVector2Cross(V1, V2);
+ XMVECTOR C2 = XMVector2Cross(V2, V3);
+ // If C1 is not close to epsilon, use the calculated value
+ XMVECTOR vResultMask = _mm_setzero_ps();
+ vResultMask = _mm_sub_ps(vResultMask,C1);
+ vResultMask = _mm_max_ps(vResultMask,C1);
+ // 0xFFFFFFFF if the calculated value is to be used
+ vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon);
+    // If C1 is near zero, pick the failure value from C2: coincident lines yield INFINITY, parallel lines yield NAN
+ XMVECTOR vFailMask = _mm_setzero_ps();
+ vFailMask = _mm_sub_ps(vFailMask,C2);
+ vFailMask = _mm_max_ps(vFailMask,C2);
+ vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
+ XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
+ vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
+ // vFail is NAN or INF
+ vFail = _mm_or_ps(vFail,vFailMask);
+ // Intersection point = Line1Point1 + V1 * (C2 / C1)
+ XMVECTOR vResult = _mm_div_ps(C2,C1);
+ vResult = _mm_mul_ps(vResult,V1);
+ vResult = _mm_add_ps(vResult,Line1Point1);
+ // Use result, or failure value
+ vResult = _mm_and_ps(vResult,vResultMask);
+ vResultMask = _mm_andnot_ps(vResultMask,vFail);
+ vResult = _mm_or_ps(vResult,vResultMask);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2Transform
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Result;
+
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
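+    // Splat x and y across all lanes, scale matrix rows 0 and 1, then add the translation row M.r[3]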
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT4* XMVector2TransformStream
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT2* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x);
+
+ Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
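+    // x and y are loaded and splatted with _mm_load_ps1; the XMFLOAT4 result is written with an unaligned store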
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2*>(pInputVector)->x);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2*>(pInputVector)->y);
+ vResult = _mm_mul_ps(vResult,M.r[1]);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+ _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),vResult);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT4* XMVector2TransformStreamNC
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT2* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
+ return XMVector2TransformStream( pOutputStream, OutputStride, pInputStream, InputStride, VectorCount, M );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2TransformCoord
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR InverseW;
+ XMVECTOR Result;
+
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ InverseW = XMVectorSplatW(Result);
+ InverseW = XMVectorReciprocal(InverseW);
+
+ Result = XMVectorMultiply(Result, InverseW);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
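+    // Same as XMVector2Transform, followed by a divide by the splatted w to project the result back onto w == 1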
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
+ vResult = _mm_div_ps(vResult,vTemp);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT2* XMVector2TransformCoordStream
+(
+ XMFLOAT2* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT2* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR InverseW;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x);
+
+ Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ InverseW = XMVectorSplatW(Result);
+ InverseW = XMVectorReciprocal(InverseW);
+
+ Result = XMVectorMultiply(Result, InverseW);
+
+ XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ size_t i;
+ CONST BYTE *pInputVector = (CONST BYTE*)pInputStream;
+ BYTE *pOutputVector = (BYTE*)pOutputStream;
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2*>(pInputVector)->x);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2*>(pInputVector)->y);
+ vResult = _mm_mul_ps(vResult,M.r[1]);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+ X = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
+ vResult = _mm_div_ps(vResult,X);
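+        // Store only the low two floats (x and y) of the result with a single 64-bit store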
+ _mm_store_sd(reinterpret_cast<double *>(pOutputVector),reinterpret_cast<__m128d *>(&vResult)[0]);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector2TransformNormal
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Result;
+
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiply(Y, M.r[1]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
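+    // Same as XMVector2Transform but without adding the translation row M.r[3], since normals are directions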
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT2* XMVector2TransformNormalStream
+(
+ XMFLOAT2* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT2* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// Y = XMVectorReplicate(((XMFLOAT2*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT2*)pInputVector)->x);
+
+ Result = XMVectorMultiply(Y, M.r[1]);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ size_t i;
+    CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE *pOutputVector = (BYTE*)pOutputStream;
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2 *>(pInputVector)->x);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT2 *>(pInputVector)->y);
+ vResult = _mm_mul_ps(vResult,M.r[1]);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+ _mm_store_sd(reinterpret_cast<double*>(pOutputVector),reinterpret_cast<const __m128d *>(&vResult)[0]);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3Equal
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector3EqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] == V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] != V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp)&7;
+ UINT CR = 0;
+ if (iTest==7)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3EqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return (((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector3EqualIntR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
+ (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
+ (V1.vector4_u32[2] == V2.vector4_u32[2]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
+ (V1.vector4_u32[1] != V2.vector4_u32[1]) &&
+ (V1.vector4_u32[2] != V2.vector4_u32[2]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&7;
+ UINT CR = 0;
+ if (iTemp==7)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTemp)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3NearEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT dx, dy, dz;
+
+ dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+ dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+ dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
+ return (((dx <= Epsilon.vector4_f32[0]) &&
+ (dy <= Epsilon.vector4_f32[1]) &&
+ (dz <= Epsilon.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get the difference
+ XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+ // Get the absolute value of the difference
+ XMVECTOR vTemp = _mm_setzero_ps();
+ vTemp = _mm_sub_ps(vTemp,vDelta);
+ vTemp = _mm_max_ps(vTemp,vDelta);
+ vTemp = _mm_cmple_ps(vTemp,Epsilon);
+ // w is don't care
+ return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3NotEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)!=7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAnyFalse(XMVector3EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3NotEqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return (((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])&7)!=7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3Greater
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3GreaterR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector3GreaterR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] > V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] <= V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ UINT CR = 0;
+ int iTest = _mm_movemask_ps(vTemp)&7;
+ if (iTest==7)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3GreaterOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector3GreaterOrEqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] >= V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] < V2.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ UINT CR = 0;
+ int iTest = _mm_movemask_ps(vTemp)&7;
+ if (iTest==7)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3Less
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3GreaterR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3LessOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
+ return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+ return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3InBounds
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
+ (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // x,y and z in bounds? (w is don't care)
+ return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0);
+#else
+ return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector3InBoundsR
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
+ (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]))
+ {
+ CR = XM_CRMASK_CR6BOUNDS;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // x,y and z in bounds? (w is don't care)
+ return ((_mm_movemask_ps(vTemp1)&0x7)==0x7) ? XM_CRMASK_CR6BOUNDS : 0;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3IsNaN
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return (XMISNAN(V.vector4_f32[0]) ||
+ XMISNAN(V.vector4_f32[1]) ||
+ XMISNAN(V.vector4_f32[2]));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the exponent
+ __m128i vTempInf = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMInfinity);
+ // Mask off the mantissa
+ __m128i vTempNan = _mm_and_si128(reinterpret_cast<const __m128i *>(&V)[0],g_XMQNaNTest);
+ // Are any of the exponents == 0x7F800000?
+ vTempInf = _mm_cmpeq_epi32(vTempInf,g_XMInfinity);
+    // Are any of the mantissas zero? (SSE2 doesn't have a neq test)
+ vTempNan = _mm_cmpeq_epi32(vTempNan,g_XMZero);
+ // Perform a not on the NaN test to be true on NON-zero mantissas
+ vTempNan = _mm_andnot_si128(vTempNan,vTempInf);
+ // If x, y or z are NaN, the signs are true after the merge above
+ return ((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTempNan)[0])&7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector3IsInfinite
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (XMISINF(V.vector4_f32[0]) ||
+ XMISINF(V.vector4_f32[1]) ||
+ XMISINF(V.vector4_f32[2]));
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the sign bit
+ __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
+ // Compare to infinity
+ vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
+ // If x,y or z are infinity, the signs are true.
+ return ((_mm_movemask_ps(vTemp)&7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Dot
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
+ XMVECTOR vResult = {
+ fValue,
+ fValue,
+ fValue,
+ fValue
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product
+ XMVECTOR vDot = _mm_mul_ps(V1,V2);
+ // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
+ XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+ // Result.vector4_f32[0] = x+y
+ vDot = _mm_add_ss(vDot,vTemp);
+ // x=Dot.vector4_f32[2]
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // Result.vector4_f32[0] = (x+y)+z
+ vDot = _mm_add_ss(vDot,vTemp);
+ // Splat x
+ return _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Cross
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR vResult = {
+ (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]),
+ (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]),
+ (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]),
+ 0.0f
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // y1,z1,x1,w1
+ XMVECTOR vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1));
+ // z2,x2,y2,w2
+ XMVECTOR vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2));
+ // Perform the left operation
+ XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2);
+ // z1,x1,y1,w1
+ vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1));
+ // y2,z2,x2,w2
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2));
+ // Perform the right operation
+ vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+    // Subtract the right from left, and return answer
+ vResult = _mm_sub_ps(vResult,vTemp1);
+ // Set w to zero
+ return _mm_and_ps(vResult,g_XMMask3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3LengthSq
+(
+ FXMVECTOR V
+)
+{
+ return XMVector3Dot(V, V);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3ReciprocalLengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector3LengthSq(V);
+ Result = XMVectorReciprocalSqrtEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y and z
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and y
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2));
+ // x+z, y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // y,y,y,y
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // x+z+y,??,??,??
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // Splat the length squared
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Get the reciprocal
+ vLengthSq = _mm_rsqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3ReciprocalLength
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector3LengthSq(V);
+ Result = XMVectorReciprocalSqrt(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product
+ XMVECTOR vDot = _mm_mul_ps(V,V);
+ // x=Dot.y, y=Dot.z
+ XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+ // Result.x = x+y
+ vDot = _mm_add_ss(vDot,vTemp);
+ // x=Dot.z
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // Result.x = (x+y)+z
+ vDot = _mm_add_ss(vDot,vTemp);
+ // Splat x
+ vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+    // Get the length
+ vDot = _mm_sqrt_ps(vDot);
+ // Get the reciprocal
+ vDot = _mm_div_ps(g_XMOne,vDot);
+ return vDot;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3LengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector3LengthSq(V);
+ Result = XMVectorSqrtEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y and z
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and y
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2));
+ // x+z, y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // y,y,y,y
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // x+z+y,??,??,??
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // Splat the length squared
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Get the length
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Length
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector3LengthSq(V);
+ Result = XMVectorSqrt(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y and z
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and y
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,2,1,2));
+ // x+z, y
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // y,y,y,y
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // x+z+y,??,??,??
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ // Splat the length squared
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Get the length
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// XMVector3NormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+XMFINLINE XMVECTOR XMVector3NormalizeEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result = XMVector3ReciprocalLength(V);
+ Result = XMVectorMultiply(V, Result);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product
+ XMVECTOR vDot = _mm_mul_ps(V,V);
+ // x=Dot.y, y=Dot.z
+ XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+ // Result.x = x+y
+ vDot = _mm_add_ss(vDot,vTemp);
+ // x=Dot.z
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ // Result.x = (x+y)+z
+ vDot = _mm_add_ss(vDot,vTemp);
+ // Splat x
+ vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+ // Get the reciprocal
+ vDot = _mm_rsqrt_ps(vDot);
+ // Perform the normalization
+ vDot = _mm_mul_ps(vDot,V);
+ return vDot;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Normalize
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fLength;
+ XMVECTOR vResult;
+
+ vResult = XMVector3Length( V );
+ fLength = vResult.vector4_f32[0];
+
+ // Prevent divide by zero
+ if (fLength > 0) {
+ fLength = 1.0f/fLength;
+ }
+
+ vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+ vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+ vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+ vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y and z only
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1));
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
+ // Prepare for the division
+ XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+ // Create zero with a single instruction
+ XMVECTOR vZeroMask = _mm_setzero_ps();
+ // Test for a divide by zero (Must be FP to detect -0.0)
+ vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+ // If the length is infinity, set the elements to zero
+ vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+ // Divide to perform the normalization
+ vResult = _mm_div_ps(V,vResult);
+ // Any that are infinity, set to zero
+ vResult = _mm_and_ps(vResult,vZeroMask);
+ // Select qnan or result based on infinite length
+ XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+ XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+ vResult = _mm_or_ps(vTemp1,vTemp2);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3ClampLength
+(
+ FXMVECTOR V,
+ FLOAT LengthMin,
+ FLOAT LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampMax;
+ XMVECTOR ClampMin;
+
+ ClampMax = XMVectorReplicate(LengthMax);
+ ClampMin = XMVectorReplicate(LengthMin);
+
+ return XMVector3ClampLengthV(V, ClampMin, ClampMax);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampMax = _mm_set_ps1(LengthMax);
+ XMVECTOR ClampMin = _mm_set_ps1(LengthMin);
+ return XMVector3ClampLengthV(V,ClampMin,ClampMax);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3ClampLengthV
+(
+ FXMVECTOR V,
+ FXMVECTOR LengthMin,
+ FXMVECTOR LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR Zero;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[2] == LengthMin.vector4_f32[0]));
+ XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[2] == LengthMax.vector4_f32[0]));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMin, XMVectorZero()));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMax, XMVectorZero()));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMax, LengthMin));
+
+ LengthSq = XMVector3LengthSq(V);
+
+ Zero = XMVectorZero();
+
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
+ ZeroLength = XMVectorEqual(LengthSq, Zero);
+
+ Normal = XMVectorMultiply(V, RcpLength);
+
+ Length = XMVectorMultiply(LengthSq, RcpLength);
+
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+
+ Result = XMVectorMultiply(Normal, ClampLength);
+
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax, ControlMin);
+ Result = XMVectorSelect(Result, V, Control);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
+ XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMin, g_XMZero));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMax, g_XMZero));
+ XMASSERT(XMVector3GreaterOrEqual(LengthMax, LengthMin));
+
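+    // Length and Normal are computed with a reciprocal square root; for zero- or infinite-length
+    // inputs the Select below substitutes LengthSq so the clamp does not operate on NaN values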
+ LengthSq = XMVector3LengthSq(V);
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity);
+ ZeroLength = XMVectorEqual(LengthSq,g_XMZero);
+ Normal = _mm_mul_ps(V, RcpLength);
+ Length = _mm_mul_ps(LengthSq, RcpLength);
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+ Result = _mm_mul_ps(Normal, ClampLength);
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax, ControlMin);
+ Result = XMVectorSelect(Result, V, Control);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Reflect
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ Result = XMVector3Dot(Incident, Normal);
+ Result = XMVectorAdd(Result, Result);
+ Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ XMVECTOR Result = XMVector3Dot(Incident, Normal);
+ Result = _mm_add_ps(Result, Result);
+ Result = _mm_mul_ps(Result, Normal);
+ Result = _mm_sub_ps(Incident,Result);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Refract
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FLOAT RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Index;
+ Index = XMVectorReplicate(RefractionIndex);
+ return XMVector3RefractV(Incident, Normal, Index);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Index = _mm_set_ps1(RefractionIndex);
+ return XMVector3RefractV(Incident,Normal,Index);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3RefractV
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FXMVECTOR RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR IDotN;
+ XMVECTOR R;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+
+ IDotN = XMVector3Dot(Incident, Normal);
+
+ // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
+ R = XMVectorMultiply(R, RefractionIndex);
+ R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
+
+ if (XMVector4LessOrEqual(R, Zero))
+ {
+ // Total internal reflection
+ return Zero;
+ }
+ else
+ {
+ XMVECTOR Result;
+
+ // R = RefractionIndex * IDotN + sqrt(R)
+ R = XMVectorSqrt(R);
+ R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
+
+ // Result = RefractionIndex * Incident - Normal * R
+ Result = XMVectorMultiply(RefractionIndex, Incident);
+ Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
+
+ return Result;
+ }
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+ XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
+ // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
+ R = _mm_sub_ps(g_XMOne,R);
+ R = _mm_mul_ps(R, RefractionIndex);
+ R = _mm_mul_ps(R, RefractionIndex);
+ R = _mm_sub_ps(g_XMOne,R);
+
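+    // Branch to the total-internal-reflection case only when every lane of R is non-positive,
+    // mirroring the XMVector4LessOrEqual test in the no-intrinsics path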
+ XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
+ if (_mm_movemask_ps(vResult)==0x0f)
+ {
+ // Total internal reflection
+ vResult = g_XMZero;
+ }
+ else
+ {
+ // R = RefractionIndex * IDotN + sqrt(R)
+ R = _mm_sqrt_ps(R);
+ vResult = _mm_mul_ps(RefractionIndex,IDotN);
+ R = _mm_add_ps(R,vResult);
+ // Result = RefractionIndex * Incident - Normal * R
+ vResult = _mm_mul_ps(RefractionIndex, Incident);
+ R = _mm_mul_ps(R,Normal);
+ vResult = _mm_sub_ps(vResult,R);
+ }
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Orthogonal
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeV;
+ XMVECTOR Z, YZYY;
+ XMVECTOR ZIsNegative, YZYYIsNegative;
+ XMVECTOR S, D;
+ XMVECTOR R0, R1;
+ XMVECTOR Select;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ static CONST XMVECTORU32 Permute1X0X0X0X = {XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
+    static CONST XMVECTORU32 Permute0Y0Z0Y0Y = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+
+ Zero = XMVectorZero();
+ Z = XMVectorSplatZ(V);
+ YZYY = XMVectorPermute(V, V, Permute0Y0Z0Y0Y.v);
+
+ NegativeV = XMVectorSubtract(Zero, V);
+
+ ZIsNegative = XMVectorLess(Z, Zero);
+ YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+ S = XMVectorAdd(YZYY, Z);
+ D = XMVectorSubtract(YZYY, Z);
+
+ Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+ R0 = XMVectorPermute(NegativeV, S, Permute1X0X0X0X.v);
+ R1 = XMVectorPermute(V, D, Permute1X0X0X0X.v);
+
+ Result = XMVectorSelect(R1, R0, Select);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR NegativeV;
+ XMVECTOR Z, YZYY;
+ XMVECTOR ZIsNegative, YZYYIsNegative;
+ XMVECTOR S, D;
+ XMVECTOR R0, R1;
+ XMVECTOR Select;
+ XMVECTOR Zero;
+ XMVECTOR Result;
+ static CONST XMVECTORI32 Permute1X0X0X0X = {XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X};
+    static CONST XMVECTORI32 Permute0Y0Z0Y0Y = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_0Y};
+
+ Zero = XMVectorZero();
+ Z = XMVectorSplatZ(V);
+ YZYY = XMVectorPermute(V, V, Permute0Y0Z0Y0Y);
+
+ NegativeV = _mm_sub_ps(Zero, V);
+
+ ZIsNegative = XMVectorLess(Z, Zero);
+ YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+ S = _mm_add_ps(YZYY, Z);
+ D = _mm_sub_ps(YZYY, Z);
+
+ Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+ R0 = XMVectorPermute(NegativeV, S, Permute1X0X0X0X);
+    R1 = XMVectorPermute(V, D, Permute1X0X0X0X);
+ Result = XMVectorSelect(R1, R0, Select);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3AngleBetweenNormalsEst
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+
+ Result = XMVector3Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACosEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector3Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACosEst(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3AngleBetweenNormals
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+
+ Result = XMVector3Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACos(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector3Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+ vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACos(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3AngleBetweenVectors
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ L1 = XMVector3ReciprocalLength(V1);
+ L2 = XMVector3ReciprocalLength(V2);
+
+ Dot = XMVector3Dot(V1, V2);
+
+ L1 = XMVectorMultiply(L1, L2);
+
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+
+ CosAngle = XMVectorMultiply(Dot, L1);
+
+ CosAngle = XMVectorClamp(CosAngle, NegativeOne, One);
+
+ Result = XMVectorACos(CosAngle);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR Result;
+
+ L1 = XMVector3ReciprocalLength(V1);
+ L2 = XMVector3ReciprocalLength(V2);
+ Dot = XMVector3Dot(V1, V2);
+ L1 = _mm_mul_ps(L1, L2);
+ CosAngle = _mm_mul_ps(Dot, L1);
+ CosAngle = XMVectorClamp(CosAngle,g_XMNegativeOne,g_XMOne);
+ Result = XMVectorACos(CosAngle);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3LinePointDistance
+(
+ FXMVECTOR LinePoint1,
+ FXMVECTOR LinePoint2,
+ FXMVECTOR Point
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR PointVector;
+ XMVECTOR LineVector;
+ XMVECTOR ReciprocalLengthSq;
+ XMVECTOR PointProjectionScale;
+ XMVECTOR DistanceVector;
+ XMVECTOR Result;
+
+ // Given a vector PointVector from LinePoint1 to Point and a vector
+ // LineVector from LinePoint1 to LinePoint2, the scaled distance
+ // PointProjectionScale from LinePoint1 to the perpendicular projection
+ // of PointVector onto the line is defined as:
+ //
+ // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
+
+ PointVector = XMVectorSubtract(Point, LinePoint1);
+ LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
+
+ ReciprocalLengthSq = XMVector3LengthSq(LineVector);
+ ReciprocalLengthSq = XMVectorReciprocal(ReciprocalLengthSq);
+
+ PointProjectionScale = XMVector3Dot(PointVector, LineVector);
+ PointProjectionScale = XMVectorMultiply(PointProjectionScale, ReciprocalLengthSq);
+
+ DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
+ DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
+
+ Result = XMVector3Length(DistanceVector);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
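+    // Same projection as the scalar path above; ReciprocalLengthSq actually holds the squared length,
+    // so a divide stands in for the reciprocal multiply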
+ XMVECTOR PointVector = _mm_sub_ps(Point,LinePoint1);
+ XMVECTOR LineVector = _mm_sub_ps(LinePoint2,LinePoint1);
+ XMVECTOR ReciprocalLengthSq = XMVector3LengthSq(LineVector);
+ XMVECTOR vResult = XMVector3Dot(PointVector,LineVector);
+ vResult = _mm_div_ps(vResult,ReciprocalLengthSq);
+ vResult = _mm_mul_ps(vResult,LineVector);
+ vResult = _mm_sub_ps(PointVector,vResult);
+ vResult = XMVector3Length(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE VOID XMVector3ComponentsFromNormal
+(
+ XMVECTOR* pParallel,
+ XMVECTOR* pPerpendicular,
+ FXMVECTOR V,
+ FXMVECTOR Normal
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Parallel;
+ XMVECTOR Scale;
+
+ XMASSERT(pParallel);
+ XMASSERT(pPerpendicular);
+
+ Scale = XMVector3Dot(V, Normal);
+
+ Parallel = XMVectorMultiply(Normal, Scale);
+
+ *pParallel = Parallel;
+ *pPerpendicular = XMVectorSubtract(V, Parallel);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pParallel);
+ XMASSERT(pPerpendicular);
+ XMVECTOR Scale = XMVector3Dot(V, Normal);
+ XMVECTOR Parallel = _mm_mul_ps(Normal,Scale);
+ *pParallel = Parallel;
+ *pPerpendicular = _mm_sub_ps(V,Parallel);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Transform a vector using a rotation expressed as a unit quaternion
+
+XMFINLINE XMVECTOR XMVector3Rotate
+(
+ FXMVECTOR V,
+ FXMVECTOR RotationQuaternion
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR A;
+ XMVECTOR Q;
+ XMVECTOR Result;
+
+ A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
+ Q = XMQuaternionConjugate(RotationQuaternion);
+ Result = XMQuaternionMultiply(Q, A);
+ Result = XMQuaternionMultiply(Result, RotationQuaternion);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
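+    // Clear w so only the vector part participates, then multiply by the conjugate and by the
+    // rotation quaternion to apply the rotation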
+ XMVECTOR A;
+ XMVECTOR Q;
+ XMVECTOR Result;
+
+ A = _mm_and_ps(V,g_XMMask3);
+ Q = XMQuaternionConjugate(RotationQuaternion);
+ Result = XMQuaternionMultiply(Q, A);
+ Result = XMQuaternionMultiply(Result, RotationQuaternion);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Transform a vector using the inverse of a rotation expressed as a unit quaternion
+
+XMFINLINE XMVECTOR XMVector3InverseRotate
+(
+ FXMVECTOR V,
+ FXMVECTOR RotationQuaternion
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR A;
+ XMVECTOR Q;
+ XMVECTOR Result;
+
+ A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
+ Result = XMQuaternionMultiply(RotationQuaternion, A);
+ Q = XMQuaternionConjugate(RotationQuaternion);
+ Result = XMQuaternionMultiply(Result, Q);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR A;
+ XMVECTOR Q;
+ XMVECTOR Result;
+ A = _mm_and_ps(V,g_XMMask3);
+ Result = XMQuaternionMultiply(RotationQuaternion, A);
+ Q = XMQuaternionConjugate(RotationQuaternion);
+ Result = XMQuaternionMultiply(Result, Q);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Transform
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR Result;
+
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ vTemp = _mm_mul_ps(vTemp,M.r[2]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT4* XMVector3TransformStream
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->x);
+ XMVECTOR Y = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->y);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->z);
+ vResult = _mm_mul_ps(vResult,M.r[2]);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ Y = _mm_mul_ps(Y,M.r[1]);
+ vResult = _mm_add_ps(vResult,Y);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+ _mm_storeu_ps(reinterpret_cast<float *>(pOutputVector),vResult);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
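Usage sketch for the stream variant (illustrative; array size and data are arbitrary). The strides are byte offsets between consecutive elements, so tightly packed arrays simply pass the size of their element type:

    XMFLOAT3 src[16];                     // input positions, filled elsewhere
    XMFLOAT4 dst[16];                     // homogeneous results
    XMMATRIX M = XMMatrixIdentity();      // any transform
    XMVector3TransformStream(dst, sizeof(XMFLOAT4),
                             src, sizeof(XMFLOAT3),
                             16, M);
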
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT4* XMVector3TransformStreamNC
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
+ return XMVector3TransformStream( pOutputStream, OutputStride, pInputStream, InputStride, VectorCount, M );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3TransformCoord
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR InverseW;
+ XMVECTOR Result;
+
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ InverseW = XMVectorSplatW(Result);
+ InverseW = XMVectorReciprocal(InverseW);
+
+ Result = XMVectorMultiply(Result, InverseW);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ vTemp = _mm_mul_ps(vTemp,M.r[2]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ vTemp = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
+ vResult = _mm_div_ps(vResult,vTemp);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT3* XMVector3TransformCoordStream
+(
+ XMFLOAT3* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR InverseW;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// Z = XMVectorReplicate(((XMFLOAT3*)pInputVector)->z);
+// Y = XMVectorReplicate(((XMFLOAT3*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT3*)pInputVector)->x);
+
+ Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ InverseW = XMVectorSplatW(Result);
+ InverseW = XMVectorReciprocal(InverseW);
+
+ Result = XMVectorMultiply(Result, InverseW);
+
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ size_t i;
+ CONST BYTE *pInputVector = (CONST BYTE*)pInputStream;
+ BYTE *pOutputVector = (BYTE*)pOutputStream;
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->x);
+ XMVECTOR Y = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->y);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->z);
+ vResult = _mm_mul_ps(vResult,M.r[2]);
+ vResult = _mm_add_ps(vResult,M.r[3]);
+ Y = _mm_mul_ps(Y,M.r[1]);
+ vResult = _mm_add_ps(vResult,Y);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+
+ X = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
+ vResult = _mm_div_ps(vResult,X);
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->x,vResult);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->y,vResult);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->z,vResult);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3TransformNormal
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR Result;
+
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+
+ Result = XMVectorMultiply(Z, M.r[2]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ vResult = _mm_mul_ps(vResult,M.r[0]);
+ XMVECTOR vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ vTemp = _mm_mul_ps(vTemp,M.r[1]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ vTemp = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ vTemp = _mm_mul_ps(vTemp,M.r[2]);
+ vResult = _mm_add_ps(vResult,vTemp);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
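The three 3D transform variants above differ in how w and translation are handled; a small illustrative sketch (arbitrary values):

    XMMATRIX M = XMMatrixTranslation(10.0f, 0.0f, 0.0f);   // any affine transform
    XMVECTOR v = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);
    XMVECTOR h = XMVector3Transform(v, M);        // homogeneous result, w left as computed
    XMVECTOR p = XMVector3TransformCoord(v, M);   // point: divides x,y,z by the resulting w
    XMVECTOR n = XMVector3TransformNormal(v, M);  // direction: ignores the translation row
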
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT3* XMVector3TransformNormalStream
+(
+ XMFLOAT3* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// Z = XMVectorReplicate(((XMFLOAT3*)pInputVector)->z);
+// Y = XMVectorReplicate(((XMFLOAT3*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT3*)pInputVector)->x);
+
+ Result = XMVectorMultiply(Z, M.r[2]);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ size_t i;
+ CONST BYTE *pInputVector = (CONST BYTE*)pInputStream;
+ BYTE *pOutputVector = (BYTE*)pOutputStream;
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ XMVECTOR X = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->x);
+ XMVECTOR Y = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->y);
+ XMVECTOR vResult = _mm_load_ps1(&reinterpret_cast<const XMFLOAT3 *>(pInputVector)->z);
+ vResult = _mm_mul_ps(vResult,M.r[2]);
+ Y = _mm_mul_ps(Y,M.r[1]);
+ vResult = _mm_add_ps(vResult,Y);
+ X = _mm_mul_ps(X,M.r[0]);
+ vResult = _mm_add_ps(vResult,X);
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->x,vResult);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->y,vResult);
+ vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(&reinterpret_cast<XMFLOAT3 *>(pOutputVector)->z,vResult);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMINLINE XMVECTOR XMVector3Project
+(
+ FXMVECTOR V,
+ FLOAT ViewportX,
+ FLOAT ViewportY,
+ FLOAT ViewportWidth,
+ FLOAT ViewportHeight,
+ FLOAT ViewportMinZ,
+ FLOAT ViewportMaxZ,
+ CXMMATRIX Projection,
+ CXMMATRIX View,
+ CXMMATRIX World
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ FLOAT HalfViewportWidth = ViewportWidth * 0.5f;
+ FLOAT HalfViewportHeight = ViewportHeight * 0.5f;
+
+ Scale = XMVectorSet(HalfViewportWidth,
+ -HalfViewportHeight,
+ ViewportMaxZ - ViewportMinZ,
+ 0.0f);
+
+ Offset = XMVectorSet(ViewportX + HalfViewportWidth,
+ ViewportY + HalfViewportHeight,
+ ViewportMinZ,
+ 0.0f);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+
+ Result = XMVector3TransformCoord(V, Transform);
+
+ Result = XMVectorMultiplyAdd(Result, Scale, Offset);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ FLOAT HalfViewportWidth = ViewportWidth * 0.5f;
+ FLOAT HalfViewportHeight = ViewportHeight * 0.5f;
+
+ Scale = XMVectorSet(HalfViewportWidth,
+ -HalfViewportHeight,
+ ViewportMaxZ - ViewportMinZ,
+ 0.0f);
+
+ Offset = XMVectorSet(ViewportX + HalfViewportWidth,
+ ViewportY + HalfViewportHeight,
+ ViewportMinZ,
+ 0.0f);
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+ Result = XMVector3TransformCoord(V, Transform);
+ Result = _mm_mul_ps(Result,Scale);
+ Result = _mm_add_ps(Result,Offset);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
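Illustrative sketch of projecting a world-space point into viewport (pixel) coordinates; the camera and viewport parameters below are arbitrary example values:

    XMMATRIX world = XMMatrixIdentity();
    XMMATRIX view  = XMMatrixLookAtLH(XMVectorSet(0.0f, 0.0f, -5.0f, 1.0f),   // eye
                                      XMVectorSet(0.0f, 0.0f,  0.0f, 1.0f),   // focus
                                      XMVectorSet(0.0f, 1.0f,  0.0f, 0.0f));  // up
    XMMATRIX proj  = XMMatrixPerspectiveFovLH(XM_PIDIV4, 1280.0f / 720.0f, 0.1f, 100.0f);
    XMVECTOR pos   = XMVectorSet(1.0f, 1.0f, 0.0f, 1.0f);
    XMVECTOR pixel = XMVector3Project(pos, 0.0f, 0.0f, 1280.0f, 720.0f, 0.0f, 1.0f,
                                      proj, view, world);
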
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT3* XMVector3ProjectStream
+(
+ XMFLOAT3* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ FLOAT ViewportX,
+ FLOAT ViewportY,
+ FLOAT ViewportWidth,
+ FLOAT ViewportHeight,
+ FLOAT ViewportMinZ,
+ FLOAT ViewportMaxZ,
+ CXMMATRIX Projection,
+ CXMMATRIX View,
+ CXMMATRIX World
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX Transform;
+ XMVECTOR V;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ size_t i;
+ FLOAT HalfViewportWidth = ViewportWidth * 0.5f;
+ FLOAT HalfViewportHeight = ViewportHeight * 0.5f;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ Scale = XMVectorSet(HalfViewportWidth,
+ -HalfViewportHeight,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+
+ Offset = XMVectorSet(ViewportX + HalfViewportWidth,
+ ViewportY + HalfViewportHeight,
+ ViewportMinZ,
+ 0.0f);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+
+ Result = XMVector3TransformCoord(V, Transform);
+
+ Result = XMVectorMultiplyAdd(Result, Scale, Offset);
+
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ XMMATRIX Transform;
+ XMVECTOR V;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Result;
+ size_t i;
+ FLOAT HalfViewportWidth = ViewportWidth * 0.5f;
+ FLOAT HalfViewportHeight = ViewportHeight * 0.5f;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ Scale = XMVectorSet(HalfViewportWidth,
+ -HalfViewportHeight,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+
+ Offset = XMVectorSet(ViewportX + HalfViewportWidth,
+ ViewportY + HalfViewportHeight,
+ ViewportMinZ,
+ 0.0f);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+
+ Result = XMVector3TransformCoord(V, Transform);
+
+ Result = _mm_mul_ps(Result,Scale);
+ Result = _mm_add_ps(Result,Offset);
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+ return pOutputStream;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector3Unproject
+(
+ FXMVECTOR V,
+ FLOAT ViewportX,
+ FLOAT ViewportY,
+ FLOAT ViewportWidth,
+ FLOAT ViewportHeight,
+ FLOAT ViewportMinZ,
+ FLOAT ViewportMaxZ,
+ CXMMATRIX Projection,
+ CXMMATRIX View,
+ CXMMATRIX World
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Determinant;
+ XMVECTOR Result;
+ CONST XMVECTOR D = XMVectorSet(-1.0f, 1.0f, 0.0f, 0.0f);
+
+ Scale = XMVectorSet(ViewportWidth * 0.5f,
+ -ViewportHeight * 0.5f,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+ Scale = XMVectorReciprocal(Scale);
+
+ Offset = XMVectorSet(-ViewportX,
+ -ViewportY,
+ -ViewportMinZ,
+ 0.0f);
+ Offset = XMVectorMultiplyAdd(Scale, Offset, D);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+ Transform = XMMatrixInverse(&Determinant, Transform);
+
+ Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+ Result = XMVector3TransformCoord(Result, Transform);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR Determinant;
+ XMVECTOR Result;
+ CONST XMVECTORF32 D = {-1.0f, 1.0f, 0.0f, 0.0f};
+
+ Scale = XMVectorSet(ViewportWidth * 0.5f,
+ -ViewportHeight * 0.5f,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+ Scale = XMVectorReciprocal(Scale);
+
+ Offset = XMVectorSet(-ViewportX,
+ -ViewportY,
+ -ViewportMinZ,
+ 0.0f);
+ Offset = _mm_mul_ps(Offset,Scale);
+ Offset = _mm_add_ps(Offset,D);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+ Transform = XMMatrixInverse(&Determinant, Transform);
+
+ Result = _mm_mul_ps(V,Scale);
+ Result = _mm_add_ps(Result,Offset);
+
+ Result = XMVector3TransformCoord(Result, Transform);
+
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
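A common use of XMVector3Unproject is building a picking ray from a cursor position; an illustrative sketch reusing the example viewport and matrices from the XMVector3Project sketch above:

    FLOAT sx = 640.0f, sy = 360.0f;   // cursor position in pixels
    XMVECTOR nearPt = XMVector3Unproject(XMVectorSet(sx, sy, 0.0f, 1.0f),
                                         0.0f, 0.0f, 1280.0f, 720.0f, 0.0f, 1.0f,
                                         proj, view, world);
    XMVECTOR farPt  = XMVector3Unproject(XMVectorSet(sx, sy, 1.0f, 1.0f),
                                         0.0f, 0.0f, 1280.0f, 720.0f, 0.0f, 1.0f,
                                         proj, view, world);
    XMVECTOR rayDir = XMVector3Normalize(XMVectorSubtract(farPt, nearPt));
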
+//------------------------------------------------------------------------------
+
+XMINLINE XMFLOAT3* XMVector3UnprojectStream
+(
+ XMFLOAT3* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT3* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ FLOAT ViewportX,
+ FLOAT ViewportY,
+ FLOAT ViewportWidth,
+ FLOAT ViewportHeight,
+ FLOAT ViewportMinZ,
+ FLOAT ViewportMaxZ,
+ CXMMATRIX Projection,
+ CXMMATRIX View,
+ CXMMATRIX World)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR V;
+ XMVECTOR Determinant;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+ CONST XMVECTOR D = XMVectorSet(-1.0f, 1.0f, 0.0f, 0.0f);
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ Scale = XMVectorSet(ViewportWidth * 0.5f,
+ -ViewportHeight * 0.5f,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+ Scale = XMVectorReciprocal(Scale);
+
+ Offset = XMVectorSet(-ViewportX,
+ -ViewportY,
+ -ViewportMinZ,
+ 0.0f);
+ Offset = XMVectorMultiplyAdd(Scale, Offset, D);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+ Transform = XMMatrixInverse(&Determinant, Transform);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+
+ Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+ Result = XMVector3TransformCoord(Result, Transform);
+
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+ XMMATRIX Transform;
+ XMVECTOR Scale;
+ XMVECTOR Offset;
+ XMVECTOR V;
+ XMVECTOR Determinant;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+ CONST XMVECTORF32 D = {-1.0f, 1.0f, 0.0f, 0.0f};
+
+ Scale = XMVectorSet(ViewportWidth * 0.5f,
+ -ViewportHeight * 0.5f,
+ ViewportMaxZ - ViewportMinZ,
+ 1.0f);
+ Scale = XMVectorReciprocal(Scale);
+
+ Offset = XMVectorSet(-ViewportX,
+ -ViewportY,
+ -ViewportMinZ,
+ 0.0f);
+ Offset = _mm_mul_ps(Offset,Scale);
+ Offset = _mm_add_ps(Offset,D);
+
+ Transform = XMMatrixMultiply(World, View);
+ Transform = XMMatrixMultiply(Transform, Projection);
+ Transform = XMMatrixInverse(&Determinant, Transform);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
+
+ Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+ Result = XMVector3TransformCoord(Result, Transform);
+
+ XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * 4D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4Equal
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector4EqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+
+ if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
+ (V1.vector4_f32[3] == V2.vector4_f32[3]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
+ (V1.vector4_f32[3] != V2.vector4_f32[3]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp);
+ UINT CR = 0;
+ if (iTest==0xf) // All equal?
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (iTest==0) // All not equal?
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4EqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return ((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])==0xf) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector4EqualIntR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
+ V1.vector4_u32[1] == V2.vector4_u32[1] &&
+ V1.vector4_u32[2] == V2.vector4_u32[2] &&
+ V1.vector4_u32[3] == V2.vector4_u32[3])
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
+ V1.vector4_u32[1] != V2.vector4_u32[1] &&
+ V1.vector4_u32[2] != V2.vector4_u32[2] &&
+ V1.vector4_u32[3] != V2.vector4_u32[3])
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ int iTest = _mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0]);
+ UINT CR = 0;
+ if (iTest==0xf) // All equal?
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (iTest==0) // All not equal?
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+XMFINLINE BOOL XMVector4NearEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT dx, dy, dz, dw;
+
+ dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+ dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+ dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
+ dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
+ return (((dx <= Epsilon.vector4_f32[0]) &&
+ (dy <= Epsilon.vector4_f32[1]) &&
+ (dz <= Epsilon.vector4_f32[2]) &&
+ (dw <= Epsilon.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Get the difference
+ XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+ // Get the absolute value of the difference
+ XMVECTOR vTemp = _mm_setzero_ps();
+ vTemp = _mm_sub_ps(vTemp,vDelta);
+ vTemp = _mm_max_ps(vTemp,vDelta);
+ vTemp = _mm_cmple_ps(vTemp,Epsilon);
+ return ((_mm_movemask_ps(vTemp)==0xf) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
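Illustrative sketch of the component-wise epsilon comparison (arbitrary values):

    XMVECTOR a   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
    XMVECTOR b   = XMVectorSet(1.0001f, 2.0f, 3.0f, 4.0f);
    XMVECTOR eps = XMVectorReplicate(1.0e-3f);
    BOOL close   = XMVector4NearEqual(a, b, eps);   // TRUE: every |a - b| component <= eps
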
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4NotEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)) != 0);
+#else
+ return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4NotEqualInt
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ __m128i vTemp = _mm_cmpeq_epi32(reinterpret_cast<const __m128i *>(&V1)[0],reinterpret_cast<const __m128i *>(&V2)[0]);
+ return ((_mm_movemask_ps(reinterpret_cast<const __m128 *>(&vTemp)[0])!=0xF) != 0);
+#else
+ return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4Greater
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector4GreaterR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
+ V1.vector4_f32[1] > V2.vector4_f32[1] &&
+ V1.vector4_f32[2] > V2.vector4_f32[2] &&
+ V1.vector4_f32[3] > V2.vector4_f32[3])
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
+ V1.vector4_f32[1] <= V2.vector4_f32[1] &&
+ V1.vector4_f32[2] <= V2.vector4_f32[2] &&
+ V1.vector4_f32[3] <= V2.vector4_f32[3])
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ UINT CR = 0;
+ XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp);
+ if (iTest==0xf) {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4GreaterOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector4GreaterOrEqualR
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ UINT CR = 0;
+ if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
+ (V1.vector4_f32[3] >= V2.vector4_f32[3]))
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
+ (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
+ (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
+ (V1.vector4_f32[3] < V2.vector4_f32[3]))
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ UINT CR = 0;
+ XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+ int iTest = _mm_movemask_ps(vTemp);
+ if (iTest==0x0f)
+ {
+ CR = XM_CRMASK_CR6TRUE;
+ }
+ else if (!iTest)
+ {
+ CR = XM_CRMASK_CR6FALSE;
+ }
+ return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4Less
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4LessOrEqual
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
+ return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
+#else
+ return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4InBounds
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
+ (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
+ (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // All in bounds?
+ return ((_mm_movemask_ps(vTemp1)==0x0f) != 0);
+#else
+ return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE UINT XMVector4InBoundsR
+(
+ FXMVECTOR V,
+ FXMVECTOR Bounds
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ UINT CR = 0;
+ if ((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
+ (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
+ (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
+ (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]))
+ {
+ CR = XM_CRMASK_CR6BOUNDS;
+ }
+ return CR;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test if less than or equal
+ XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
+ // Negate the bounds
+ XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
+ // Test if greater or equal (Reversed)
+ vTemp2 = _mm_cmple_ps(vTemp2,V);
+ // Blend answers
+ vTemp1 = _mm_and_ps(vTemp1,vTemp2);
+ // All in bounds?
+ return (_mm_movemask_ps(vTemp1)==0x0f) ? XM_CRMASK_CR6BOUNDS : 0;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4IsNaN
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ return (XMISNAN(V.vector4_f32[0]) ||
+ XMISNAN(V.vector4_f32[1]) ||
+ XMISNAN(V.vector4_f32[2]) ||
+ XMISNAN(V.vector4_f32[3]));
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Test against itself. NaN is always not equal
+ XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
+ // If any are NaN, the mask is non-zero
+ return (_mm_movemask_ps(vTempNan)!=0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE BOOL XMVector4IsInfinite
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ return (XMISINF(V.vector4_f32[0]) ||
+ XMISINF(V.vector4_f32[1]) ||
+ XMISINF(V.vector4_f32[2]) ||
+ XMISINF(V.vector4_f32[3]));
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Mask off the sign bit
+ XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask);
+ // Compare to infinity
+ vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
+    // If any are infinity, the mask is non-zero
+ return (_mm_movemask_ps(vTemp) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Dot
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] =
+ Result.vector4_f32[1] =
+ Result.vector4_f32[2] =
+ Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3];
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp2 = V2;
+ XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
+ vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W;
+ vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
+ vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together
+ return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
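XMVector4Dot splats the scalar dot product into every component so the result can feed further vector math directly; a short illustrative sketch:

    XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
    XMVECTOR b = XMVectorSet(5.0f, 6.0f, 7.0f, 8.0f);
    XMVECTOR d = XMVector4Dot(a, b);        // (70, 70, 70, 70)
    FLOAT dot  = XMVectorGetX(d);           // extract the scalar when needed
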
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Cross
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2,
+ FXMVECTOR V3
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTOR Result;
+
+ Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]);
+ Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]);
+ Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]);
+ Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // V2zwyz * V3wzwy
+ XMVECTOR vResult = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,1,3,2));
+ XMVECTOR vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(1,3,2,3));
+ vResult = _mm_mul_ps(vResult,vTemp3);
+ // - V2wzwy * V3zwyz
+ XMVECTOR vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(1,3,2,3));
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(1,3,0,1));
+ vTemp2 = _mm_mul_ps(vTemp2,vTemp3);
+ vResult = _mm_sub_ps(vResult,vTemp2);
+ // term1 * V1yxxx
+ XMVECTOR vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,0,0,1));
+ vResult = _mm_mul_ps(vResult,vTemp1);
+
+ // V2ywxz * V3wxwx
+ vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,3,1));
+ vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(0,3,0,3));
+ vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
+ // - V2wxwx * V3ywxz
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(2,1,2,1));
+ vTemp1 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,0,3,1));
+ vTemp2 = _mm_mul_ps(vTemp2,vTemp1);
+ vTemp3 = _mm_sub_ps(vTemp3,vTemp2);
+ // vResult - temp * V1zzyy
+ vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(1,1,2,2));
+ vTemp1 = _mm_mul_ps(vTemp1,vTemp3);
+ vResult = _mm_sub_ps(vResult,vTemp1);
+
+ // V2yzxy * V3zxyx
+ vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(1,0,2,1));
+ vTemp3 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(0,1,0,2));
+ vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
+ // - V2zxyx * V3yzxy
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(2,0,2,1));
+ vTemp1 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(1,0,2,1));
+ vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+ vTemp3 = _mm_sub_ps(vTemp3,vTemp1);
+ // vResult + term * V1wwwz
+ vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,3,3,3));
+ vTemp3 = _mm_mul_ps(vTemp3,vTemp1);
+ vResult = _mm_add_ps(vResult,vTemp3);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4LengthSq
+(
+ FXMVECTOR V
+)
+{
+ return XMVector4Dot(V, V);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4ReciprocalLengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector4LengthSq(V);
+ Result = XMVectorReciprocalSqrtEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+ // Get the reciprocal
+ vLengthSq = _mm_rsqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4ReciprocalLength
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector4LengthSq(V);
+ Result = XMVectorReciprocalSqrt(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+ // Get the reciprocal
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ // Accurate!
+ vLengthSq = _mm_div_ps(g_XMOne,vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4LengthEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector4LengthSq(V);
+ Result = XMVectorSqrtEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+    // Take the square root to get the length
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Length
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ Result = XMVector4LengthSq(V);
+ Result = XMVectorSqrt(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+    // Take the square root to get the length
+ vLengthSq = _mm_sqrt_ps(vLengthSq);
+ return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// XMVector4NormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+XMFINLINE XMVECTOR XMVector4NormalizeEst
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result = XMVector4ReciprocalLength(V);
+ Result = XMVectorMultiply(V, Result);
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+ // Get the reciprocal
+ XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq);
+ // Reciprocal mul to perform the normalization
+ vResult = _mm_mul_ps(vResult,V);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Normalize
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fLength;
+ XMVECTOR vResult;
+
+ vResult = XMVector4Length( V );
+ fLength = vResult.vector4_f32[0];
+
+ // Prevent divide by zero
+ if (fLength > 0) {
+ fLength = 1.0f/fLength;
+ }
+
+ vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+ vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+ vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+ vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Perform the dot product on x,y,z and w
+ XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+ // vTemp has z and w
+ XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(3,2,3,2));
+ // x+z, y+w
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // x+z,x+z,x+z,y+w
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(1,0,0,0));
+ // ??,??,y+w,y+w
+ vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
+ // ??,??,x+z+y+w,??
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp);
+ // Splat the length
+ vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,2,2,2));
+ // Prepare for the division
+ XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+ // Create zero with a single instruction
+ XMVECTOR vZeroMask = _mm_setzero_ps();
+ // Test for a divide by zero (Must be FP to detect -0.0)
+ vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+ // If the length is infinity, set the elements to zero
+ vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+ // Divide to perform the normalization
+ vResult = _mm_div_ps(V,vResult);
+ // Any that are infinity, set to zero
+ vResult = _mm_and_ps(vResult,vZeroMask);
+ // Select qnan or result based on infinite length
+ XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+ XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+ vResult = _mm_or_ps(vTemp1,vTemp2);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
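Illustrative sketch contrasting the exact and estimated normalize (arbitrary input); the Est variant above trades accuracy for the faster reciprocal square-root estimate:

    XMVECTOR v    = XMVectorSet(3.0f, 0.0f, 4.0f, 0.0f);   // length 5
    XMVECTOR nrm  = XMVector4Normalize(v);                  // (0.6, 0, 0.8, 0), full precision
    XMVECTOR nrmE = XMVector4NormalizeEst(v);               // slightly less accurate, faster
    XMVECTOR len  = XMVector4Length(nrm);                   // ~1.0 in every component
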
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4ClampLength
+(
+ FXMVECTOR V,
+ FLOAT LengthMin,
+ FLOAT LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampMax;
+ XMVECTOR ClampMin;
+
+ ClampMax = XMVectorReplicate(LengthMax);
+ ClampMin = XMVectorReplicate(LengthMin);
+
+ return XMVector4ClampLengthV(V, ClampMin, ClampMax);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampMax = _mm_set_ps1(LengthMax);
+ XMVECTOR ClampMin = _mm_set_ps1(LengthMin);
+ return XMVector4ClampLengthV(V, ClampMin, ClampMax);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4ClampLengthV
+(
+ FXMVECTOR V,
+ FXMVECTOR LengthMin,
+ FXMVECTOR LengthMax
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR Zero;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((LengthMin.vector4_f32[1] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[2] == LengthMin.vector4_f32[0]) && (LengthMin.vector4_f32[3] == LengthMin.vector4_f32[0]));
+ XMASSERT((LengthMax.vector4_f32[1] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[2] == LengthMax.vector4_f32[0]) && (LengthMax.vector4_f32[3] == LengthMax.vector4_f32[0]));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMin, XMVectorZero()));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMax, XMVectorZero()));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMax, LengthMin));
+
+ LengthSq = XMVector4LengthSq(V);
+
+ Zero = XMVectorZero();
+
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
+ ZeroLength = XMVectorEqual(LengthSq, Zero);
+
+ Normal = XMVectorMultiply(V, RcpLength);
+
+ Length = XMVectorMultiply(LengthSq, RcpLength);
+
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+
+ Result = XMVectorMultiply(Normal, ClampLength);
+
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax, ControlMin);
+ Result = XMVectorSelect(Result, V, Control);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR ClampLength;
+ XMVECTOR LengthSq;
+ XMVECTOR RcpLength;
+ XMVECTOR Length;
+ XMVECTOR Normal;
+ XMVECTOR Zero;
+ XMVECTOR InfiniteLength;
+ XMVECTOR ZeroLength;
+ XMVECTOR Select;
+ XMVECTOR ControlMax;
+ XMVECTOR ControlMin;
+ XMVECTOR Control;
+ XMVECTOR Result;
+
+ XMASSERT((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin)));
+ XMASSERT((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax)));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMin, g_XMZero));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMax, g_XMZero));
+ XMASSERT(XMVector4GreaterOrEqual(LengthMax, LengthMin));
+
+ LengthSq = XMVector4LengthSq(V);
+ Zero = XMVectorZero();
+ RcpLength = XMVectorReciprocalSqrt(LengthSq);
+ InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity);
+ ZeroLength = XMVectorEqual(LengthSq, Zero);
+ Normal = _mm_mul_ps(V, RcpLength);
+ Length = _mm_mul_ps(LengthSq, RcpLength);
+ Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+ Length = XMVectorSelect(LengthSq, Length, Select);
+ Normal = XMVectorSelect(LengthSq, Normal, Select);
+ ControlMax = XMVectorGreater(Length, LengthMax);
+ ControlMin = XMVectorLess(Length, LengthMin);
+ ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+ ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+ Result = _mm_mul_ps(Normal, ClampLength);
+ // Preserve the original vector (with no precision loss) if the length falls within the given range
+ Control = XMVectorEqualInt(ControlMax,ControlMin);
+ Result = XMVectorSelect(Result,V,Control);
+ return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Reflect
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ Result = XMVector4Dot(Incident, Normal);
+ Result = XMVectorAdd(Result, Result);
+ Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+ XMVECTOR Result = XMVector4Dot(Incident,Normal);
+ Result = _mm_add_ps(Result,Result);
+ Result = _mm_mul_ps(Result,Normal);
+ Result = _mm_sub_ps(Incident,Result);
+ return Result;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
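Illustrative sketch of the reflection formula Result = Incident - 2 * dot(Incident, Normal) * Normal (for a true mirror reflection the normal should be unit length; values arbitrary):

    XMVECTOR i = XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f);   // incoming direction
    XMVECTOR n = XMVectorSet(0.0f,  1.0f, 0.0f, 0.0f);   // unit surface normal
    XMVECTOR r = XMVector4Reflect(i, n);                  // (1, 1, 0, 0): mirrored about n
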
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Refract
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FLOAT RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Index;
+ Index = XMVectorReplicate(RefractionIndex);
+ return XMVector4RefractV(Incident, Normal, Index);
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR Index = _mm_set_ps1(RefractionIndex);
+ return XMVector4RefractV(Incident,Normal,Index);
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4RefractV
+(
+ FXMVECTOR Incident,
+ FXMVECTOR Normal,
+ FXMVECTOR RefractionIndex
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR IDotN;
+ XMVECTOR R;
+ CONST XMVECTOR Zero = XMVectorZero();
+
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+
+ IDotN = XMVector4Dot(Incident, Normal);
+
+ // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
+ R = XMVectorMultiply(R, RefractionIndex);
+ R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
+
+ if (XMVector4LessOrEqual(R, Zero))
+ {
+ // Total internal reflection
+ return Zero;
+ }
+ else
+ {
+ XMVECTOR Result;
+
+ // R = RefractionIndex * IDotN + sqrt(R)
+ R = XMVectorSqrt(R);
+ R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
+
+ // Result = RefractionIndex * Incident - Normal * R
+ Result = XMVectorMultiply(RefractionIndex, Incident);
+ Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
+
+ return Result;
+ }
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+ // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+
+ XMVECTOR IDotN = XMVector4Dot(Incident,Normal);
+
+ // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+ XMVECTOR R = _mm_mul_ps(IDotN,IDotN);
+ R = _mm_sub_ps(g_XMOne,R);
+ R = _mm_mul_ps(R, RefractionIndex);
+ R = _mm_mul_ps(R, RefractionIndex);
+ R = _mm_sub_ps(g_XMOne,R);
+
+ XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
+ if (_mm_movemask_ps(vResult)==0x0f)
+ {
+ // Total internal reflection
+ vResult = g_XMZero;
+ }
+ else
+ {
+ // R = RefractionIndex * IDotN + sqrt(R)
+ R = _mm_sqrt_ps(R);
+ vResult = _mm_mul_ps(RefractionIndex, IDotN);
+ R = _mm_add_ps(R,vResult);
+ // Result = RefractionIndex * Incident - Normal * R
+ vResult = _mm_mul_ps(RefractionIndex, Incident);
+ R = _mm_mul_ps(R,Normal);
+ vResult = _mm_sub_ps(vResult,R);
+ }
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
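Illustrative sketch of refraction with a relative index of refraction; when the term under the square root goes negative the function returns the zero vector (total internal reflection), which callers should test for. Values are arbitrary:

    XMVECTOR i = XMVector4Normalize(XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f));
    XMVECTOR n = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
    XMVECTOR t = XMVector4Refract(i, n, 1.0f / 1.5f);     // relative index n1/n2 (example value)
    if (XMVector4Equal(t, XMVectorZero()))
    {
        // total internal reflection: no transmitted ray
    }
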
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4Orthogonal
+(
+ FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR Result;
+ Result.vector4_f32[0] = V.vector4_f32[2];
+ Result.vector4_f32[1] = V.vector4_f32[3];
+ Result.vector4_f32[2] = -V.vector4_f32[0];
+ Result.vector4_f32[3] = -V.vector4_f32[1];
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f};
+ XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,0,3,2));
+ vResult = _mm_mul_ps(vResult,FlipZW);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4AngleBetweenNormalsEst
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ Result = XMVector4Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACosEst(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector4Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACosEst(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4AngleBetweenNormals
+(
+ FXMVECTOR N1,
+ FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ Result = XMVector4Dot(N1, N2);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ Result = XMVectorClamp(Result, NegativeOne, One);
+ Result = XMVectorACos(Result);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vResult = XMVector4Dot(N1,N2);
+ // Clamp to -1.0f to 1.0f
+ vResult = _mm_max_ps(vResult,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+ vResult = XMVectorACos(vResult);
+ return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR XMVector4AngleBetweenVectors
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR NegativeOne;
+ XMVECTOR One;
+ XMVECTOR Result;
+
+ L1 = XMVector4ReciprocalLength(V1);
+ L2 = XMVector4ReciprocalLength(V2);
+
+ Dot = XMVector4Dot(V1, V2);
+
+ L1 = XMVectorMultiply(L1, L2);
+
+ CosAngle = XMVectorMultiply(Dot, L1);
+ NegativeOne = XMVectorSplatConstant(-1, 0);
+ One = XMVectorSplatOne();
+ CosAngle = XMVectorClamp(CosAngle, NegativeOne, One);
+
+ Result = XMVectorACos(CosAngle);
+
+ return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR L1;
+ XMVECTOR L2;
+ XMVECTOR Dot;
+ XMVECTOR CosAngle;
+ XMVECTOR Result;
+
+ L1 = XMVector4ReciprocalLength(V1);
+ L2 = XMVector4ReciprocalLength(V2);
+ Dot = XMVector4Dot(V1, V2);
+ L1 = _mm_mul_ps(L1,L2);
+ CosAngle = _mm_mul_ps(Dot,L1);
+ CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne, g_XMOne);
+ Result = XMVectorACos(CosAngle);
+ return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
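+// Transforms the row vector V by the matrix M (V * M): each component of V is
+// splatted, multiplied by the corresponding row of M, and the products are summed.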
+XMFINLINE XMVECTOR XMVector4Transform
+(
+ FXMVECTOR V,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+ FLOAT fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]);
+ FLOAT fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]);
+ FLOAT fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]);
+ FLOAT fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]);
+ XMVECTOR vResult = {
+ fX,
+ fY,
+ fZ,
+ fW
+ };
+ return vResult;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ // Splat x,y,z and w
+ XMVECTOR vTempX = _mm_shuffle_ps(V,V,_MM_SHUFFLE(0,0,0,0));
+ XMVECTOR vTempY = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR vTempZ = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
+ XMVECTOR vTempW = _mm_shuffle_ps(V,V,_MM_SHUFFLE(3,3,3,3));
+ // Mul by the matrix
+ vTempX = _mm_mul_ps(vTempX,M.r[0]);
+ vTempY = _mm_mul_ps(vTempY,M.r[1]);
+ vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
+ vTempW = _mm_mul_ps(vTempW,M.r[3]);
+ // Add them all together
+ vTempX = _mm_add_ps(vTempX,vTempY);
+ vTempZ = _mm_add_ps(vTempZ,vTempW);
+ vTempX = _mm_add_ps(vTempX,vTempZ);
+ return vTempX;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
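+// Transforms VectorCount 4D vectors, reading and writing with caller-supplied
+// byte strides; the SSE path uses unaligned loads/stores, so the streams need
+// not be 16-byte aligned.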
+XMINLINE XMFLOAT4* XMVector4TransformStream
+(
+ XMFLOAT4* pOutputStream,
+ size_t OutputStride,
+ CONST XMFLOAT4* pInputStream,
+ size_t InputStride,
+ size_t VectorCount,
+ CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+ XMVECTOR V;
+ XMVECTOR X;
+ XMVECTOR Y;
+ XMVECTOR Z;
+ XMVECTOR W;
+ XMVECTOR Result;
+ size_t i;
+ CONST BYTE* pInputVector = (CONST BYTE*)pInputStream;
+ BYTE* pOutputVector = (BYTE*)pOutputStream;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+ for (i = 0; i < VectorCount; i++)
+ {
+ V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
+ W = XMVectorSplatW(V);
+ Z = XMVectorSplatZ(V);
+ Y = XMVectorSplatY(V);
+ X = XMVectorSplatX(V);
+// W = XMVectorReplicate(((XMFLOAT4*)pInputVector)->w);
+// Z = XMVectorReplicate(((XMFLOAT4*)pInputVector)->z);
+// Y = XMVectorReplicate(((XMFLOAT4*)pInputVector)->y);
+// X = XMVectorReplicate(((XMFLOAT4*)pInputVector)->x);
+
+ Result = XMVectorMultiply(W, M.r[3]);
+ Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
+ Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+ Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+ XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+
+ return pOutputStream;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+ size_t i;
+
+ XMASSERT(pOutputStream);
+ XMASSERT(pInputStream);
+
+    const BYTE* pInputVector = reinterpret_cast<const BYTE *>(pInputStream);
+ BYTE* pOutputVector = reinterpret_cast<BYTE *>(pOutputStream);
+ for (i = 0; i < VectorCount; i++)
+ {
+ // Fetch the row and splat it
+ XMVECTOR vTempx = _mm_loadu_ps(reinterpret_cast<const float *>(pInputVector));
+ XMVECTOR vTempy = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(1,1,1,1));
+ XMVECTOR vTempz = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(2,2,2,2));
+ XMVECTOR vTempw = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(3,3,3,3));
+ vTempx = _mm_shuffle_ps(vTempx,vTempx,_MM_SHUFFLE(0,0,0,0));
+ vTempx = _mm_mul_ps(vTempx,M.r[0]);
+ vTempy = _mm_mul_ps(vTempy,M.r[1]);
+ vTempz = _mm_mul_ps(vTempz,M.r[2]);
+ vTempw = _mm_mul_ps(vTempw,M.r[3]);
+ vTempx = _mm_add_ps(vTempx,vTempy);
+ vTempw = _mm_add_ps(vTempw,vTempz);
+ vTempw = _mm_add_ps(vTempw,vTempx);
+ // Store the transformed vector
+ _mm_storeu_ps(reinterpret_cast<float *>(pOutputVector),vTempw);
+
+ pInputVector += InputStride;
+ pOutputVector += OutputStride;
+ }
+ return pOutputStream;
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+#ifdef __cplusplus
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+#ifndef XM_NO_OPERATOR_OVERLOADS
+
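+// The overloads below simply forward to the corresponding XMVector* helpers;
+// defining XM_NO_OPERATOR_OVERLOADS removes them from the build.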
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator+ (FXMVECTOR V)
+{
+ return V;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator- (FXMVECTOR V)
+{
+ return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator+=
+(
+ XMVECTOR& V1,
+ FXMVECTOR V2
+)
+{
+ V1 = XMVectorAdd(V1, V2);
+ return V1;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator-=
+(
+ XMVECTOR& V1,
+ FXMVECTOR V2
+)
+{
+ V1 = XMVectorSubtract(V1, V2);
+ return V1;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator*=
+(
+ XMVECTOR& V1,
+ FXMVECTOR V2
+)
+{
+ V1 = XMVectorMultiply(V1, V2);
+ return V1;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator/=
+(
+ XMVECTOR& V1,
+ FXMVECTOR V2
+)
+{
+ V1 = XMVectorDivide(V1,V2);
+ return V1;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator*=
+(
+ XMVECTOR& V,
+ CONST FLOAT S
+)
+{
+ V = XMVectorScale(V, S);
+ return V;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR& operator/=
+(
+ XMVECTOR& V,
+ CONST FLOAT S
+)
+{
+ V = XMVectorScale(V, 1.0f / S);
+ return V;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator+
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+ return XMVectorAdd(V1, V2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator-
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+ return XMVectorSubtract(V1, V2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator*
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+ return XMVectorMultiply(V1, V2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator/
+(
+ FXMVECTOR V1,
+ FXMVECTOR V2
+)
+{
+ return XMVectorDivide(V1,V2);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator*
+(
+ FXMVECTOR V,
+ CONST FLOAT S
+)
+{
+ return XMVectorScale(V, S);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator/
+(
+ FXMVECTOR V,
+ CONST FLOAT S
+)
+{
+ return XMVectorScale(V, 1.0f / S);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMVECTOR operator*
+(
+ FLOAT S,
+ FXMVECTOR V
+)
+{
+ return XMVectorScale(V, S);
+}
+
+#endif // !XM_NO_OPERATOR_OVERLOADS
+
+/****************************************************************************
+ *
+ * XMFLOAT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT2::_XMFLOAT2
+(
+ CONST FLOAT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT2& _XMFLOAT2::operator=
+(
+ CONST _XMFLOAT2& Float2
+)
+{
+ x = Float2.x;
+ y = Float2.y;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT2A& XMFLOAT2A::operator=
+(
+ CONST XMFLOAT2A& Float2
+)
+{
+ x = Float2.x;
+ y = Float2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMINT2 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMINT2::_XMINT2
+(
+ CONST INT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMINT2& _XMINT2::operator=
+(
+ CONST _XMINT2& Int2
+)
+{
+ x = Int2.x;
+ y = Int2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUINT2 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMUINT2::_XMUINT2
+(
+ CONST UINT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMUINT2& _XMUINT2::operator=
+(
+ CONST _XMUINT2& UInt2
+)
+{
+ x = UInt2.x;
+ y = UInt2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMHALF2 operators
+ *
+ ****************************************************************************/
+
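+// For the packed types that follow, the FLOAT-based constructors convert and pack
+// their inputs (via XMConvertFloatToHalf or the matching XMStore* routine), while
+// the array constructors copy components that are already in packed form.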
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF2::_XMHALF2
+(
+ CONST HALF* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF2::_XMHALF2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ x = XMConvertFloatToHalf(_x);
+ y = XMConvertFloatToHalf(_y);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF2::_XMHALF2
+(
+ CONST FLOAT* pArray
+)
+{
+ x = XMConvertFloatToHalf(pArray[0]);
+ y = XMConvertFloatToHalf(pArray[1]);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF2& _XMHALF2::operator=
+(
+ CONST _XMHALF2& Half2
+)
+{
+ x = Half2.x;
+ y = Half2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN2::_XMSHORTN2
+(
+ CONST SHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN2::_XMSHORTN2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN2::_XMSHORTN2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreShortN2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN2& _XMSHORTN2::operator=
+(
+ CONST _XMSHORTN2& ShortN2
+)
+{
+ x = ShortN2.x;
+ y = ShortN2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT2::_XMSHORT2
+(
+ CONST SHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT2::_XMSHORT2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT2::_XMSHORT2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreShort2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT2& _XMSHORT2::operator=
+(
+ CONST _XMSHORT2& Short2
+)
+{
+ x = Short2.x;
+ y = Short2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN2::_XMUSHORTN2
+(
+ CONST USHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN2::_XMUSHORTN2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN2::_XMUSHORTN2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUShortN2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN2& _XMUSHORTN2::operator=
+(
+ CONST _XMUSHORTN2& UShortN2
+)
+{
+ x = UShortN2.x;
+ y = UShortN2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT2::_XMUSHORT2
+(
+ CONST USHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT2::_XMUSHORT2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT2::_XMUSHORT2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUShort2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT2& _XMUSHORT2::operator=
+(
+ CONST _XMUSHORT2& UShort2
+)
+{
+ x = UShort2.x;
+ y = UShort2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN2::_XMBYTEN2
+(
+ CONST CHAR* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN2::_XMBYTEN2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN2::_XMBYTEN2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreByteN2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN2& _XMBYTEN2::operator=
+(
+ CONST _XMBYTEN2& ByteN2
+)
+{
+ x = ByteN2.x;
+ y = ByteN2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE2::_XMBYTE2
+(
+ CONST CHAR* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE2::_XMBYTE2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE2::_XMBYTE2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreByte2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE2& _XMBYTE2::operator=
+(
+ CONST _XMBYTE2& Byte2
+)
+{
+ x = Byte2.x;
+ y = Byte2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN2::_XMUBYTEN2
+(
+ CONST BYTE* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN2::_XMUBYTEN2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN2::_XMUBYTEN2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUByteN2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN2& _XMUBYTEN2::operator=
+(
+ CONST _XMUBYTEN2& UByteN2
+)
+{
+ x = UByteN2.x;
+ y = UByteN2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE2::_XMUBYTE2
+(
+ CONST BYTE* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE2::_XMUBYTE2
+(
+ FLOAT _x,
+ FLOAT _y
+)
+{
+ XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE2::_XMUBYTE2
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUByte2(this, XMLoadFloat2((const XMFLOAT2*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE2& _XMUBYTE2::operator=
+(
+ CONST _XMUBYTE2& UByte2
+)
+{
+ x = UByte2.x;
+ y = UByte2.y;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT3::_XMFLOAT3
+(
+ CONST FLOAT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT3& _XMFLOAT3::operator=
+(
+ CONST _XMFLOAT3& Float3
+)
+{
+ x = Float3.x;
+ y = Float3.y;
+ z = Float3.z;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT3A& XMFLOAT3A::operator=
+(
+ CONST XMFLOAT3A& Float3
+)
+{
+ x = Float3.x;
+ y = Float3.y;
+ z = Float3.z;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMINT3 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMINT3::_XMINT3
+(
+ CONST INT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMINT3& _XMINT3::operator=
+(
+ CONST _XMINT3& Int3
+)
+{
+ x = Int3.x;
+ y = Int3.y;
+ z = Int3.z;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUINT3 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMUINT3::_XMUINT3
+(
+ CONST UINT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMUINT3& _XMUINT3::operator=
+(
+ CONST _XMUINT3& UInt3
+)
+{
+ x = UInt3.x;
+ y = UInt3.y;
+ z = UInt3.z;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMHENDN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHENDN3::_XMHENDN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreHenDN3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHENDN3::_XMHENDN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreHenDN3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHENDN3& _XMHENDN3::operator=
+(
+ CONST _XMHENDN3& HenDN3
+)
+{
+ v = HenDN3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHENDN3& _XMHENDN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMHEND3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHEND3::_XMHEND3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreHenD3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHEND3::_XMHEND3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreHenD3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHEND3& _XMHEND3::operator=
+(
+ CONST _XMHEND3& HenD3
+)
+{
+ v = HenD3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHEND3& _XMHEND3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUHENDN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHENDN3::_XMUHENDN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreUHenDN3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHENDN3::_XMUHENDN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUHenDN3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHENDN3& _XMUHENDN3::operator=
+(
+ CONST _XMUHENDN3& UHenDN3
+)
+{
+ v = UHenDN3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHENDN3& _XMUHENDN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUHEND3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHEND3::_XMUHEND3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreUHenD3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHEND3::_XMUHEND3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUHenD3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHEND3& _XMUHEND3::operator=
+(
+ CONST _XMUHEND3& UHenD3
+)
+{
+ v = UHenD3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUHEND3& _XMUHEND3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMDHENN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHENN3::_XMDHENN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreDHenN3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHENN3::_XMDHENN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreDHenN3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHENN3& _XMDHENN3::operator=
+(
+ CONST _XMDHENN3& DHenN3
+)
+{
+ v = DHenN3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHENN3& _XMDHENN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMDHEN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHEN3::_XMDHEN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreDHen3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHEN3::_XMDHEN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreDHen3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHEN3& _XMDHEN3::operator=
+(
+ CONST _XMDHEN3& DHen3
+)
+{
+ v = DHen3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDHEN3& _XMDHEN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUDHENN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHENN3::_XMUDHENN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreUDHenN3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHENN3::_XMUDHENN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUDHenN3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHENN3& _XMUDHENN3::operator=
+(
+ CONST _XMUDHENN3& UDHenN3
+)
+{
+ v = UDHenN3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHENN3& _XMUDHENN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUDHEN3 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHEN3::_XMUDHEN3
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreUDHen3(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHEN3::_XMUDHEN3
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUDHen3(this, XMLoadFloat3((const XMFLOAT3*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHEN3& _XMUDHEN3::operator=
+(
+ CONST _XMUDHEN3& UDHen3
+)
+{
+ v = UDHen3.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDHEN3& _XMUDHEN3::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMU565 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMU565::_XMU565
+(
+ CONST CHAR *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+}
+
+XMFINLINE _XMU565::_XMU565
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+XMFINLINE _XMU565::_XMU565
+(
+ CONST FLOAT *pArray
+)
+{
+ XMStoreU565(this, XMLoadFloat3((const XMFLOAT3*)pArray ));
+}
+
+XMFINLINE _XMU565& _XMU565::operator=
+(
+ CONST _XMU565& U565
+)
+{
+ v = U565.v;
+ return *this;
+}
+
+XMFINLINE _XMU565& _XMU565::operator=
+(
+ CONST USHORT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3PK operators
+ *
+ ****************************************************************************/
+
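+// XMFLOAT3PK packs three partial-precision floats (11:11:10 bits) into a single
+// 32-bit value via XMStoreFloat3PK.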
+XMFINLINE _XMFLOAT3PK::_XMFLOAT3PK
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+XMFINLINE _XMFLOAT3PK::_XMFLOAT3PK
+(
+ CONST FLOAT *pArray
+)
+{
+ XMStoreFloat3PK(this, XMLoadFloat3((const XMFLOAT3*)pArray ));
+}
+
+XMFINLINE _XMFLOAT3PK& _XMFLOAT3PK::operator=
+(
+ CONST _XMFLOAT3PK& float3pk
+)
+{
+ v = float3pk.v;
+ return *this;
+}
+
+XMFINLINE _XMFLOAT3PK& _XMFLOAT3PK::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3SE operators
+ *
+ ****************************************************************************/
+
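+// XMFLOAT3SE packs three floats with 9-bit mantissas and a shared 5-bit exponent
+// (the R9G9B9E5 shared-exponent format) via XMStoreFloat3SE.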
+XMFINLINE _XMFLOAT3SE::_XMFLOAT3SE
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z
+)
+{
+ XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+XMFINLINE _XMFLOAT3SE::_XMFLOAT3SE
+(
+ CONST FLOAT *pArray
+)
+{
+ XMStoreFloat3SE(this, XMLoadFloat3((const XMFLOAT3*)pArray ));
+}
+
+XMFINLINE _XMFLOAT3SE& _XMFLOAT3SE::operator=
+(
+ CONST _XMFLOAT3SE& float3se
+)
+{
+ v = float3se.v;
+ return *this;
+}
+
+XMFINLINE _XMFLOAT3SE& _XMFLOAT3SE::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4::_XMFLOAT4
+(
+ CONST FLOAT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMFLOAT4& _XMFLOAT4::operator=
+(
+ CONST _XMFLOAT4& Float4
+)
+{
+ x = Float4.x;
+ y = Float4.y;
+ z = Float4.z;
+ w = Float4.w;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMFLOAT4A& XMFLOAT4A::operator=
+(
+ CONST XMFLOAT4A& Float4
+)
+{
+ x = Float4.x;
+ y = Float4.y;
+ z = Float4.z;
+ w = Float4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMINT4 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMINT4::_XMINT4
+(
+ CONST INT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMINT4& _XMINT4::operator=
+(
+ CONST _XMINT4& Int4
+)
+{
+ x = Int4.x;
+ y = Int4.y;
+ z = Int4.z;
+ w = Int4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUINT4 operators
+ *
+ ****************************************************************************/
+
+XMFINLINE _XMUINT4::_XMUINT4
+(
+ CONST UINT *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE XMUINT4& _XMUINT4::operator=
+(
+ CONST _XMUINT4& UInt4
+)
+{
+ x = UInt4.x;
+ y = UInt4.y;
+ z = UInt4.z;
+ w = UInt4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMHALF4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF4::_XMHALF4
+(
+ CONST HALF* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF4::_XMHALF4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ x = XMConvertFloatToHalf(_x);
+ y = XMConvertFloatToHalf(_y);
+ z = XMConvertFloatToHalf(_z);
+ w = XMConvertFloatToHalf(_w);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF4::_XMHALF4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(FLOAT), 4);
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMHALF4& _XMHALF4::operator=
+(
+ CONST _XMHALF4& Half4
+)
+{
+ x = Half4.x;
+ y = Half4.y;
+ z = Half4.z;
+ w = Half4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN4::_XMSHORTN4
+(
+ CONST SHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN4::_XMSHORTN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN4::_XMSHORTN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreShortN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORTN4& _XMSHORTN4::operator=
+(
+ CONST _XMSHORTN4& ShortN4
+)
+{
+ x = ShortN4.x;
+ y = ShortN4.y;
+ z = ShortN4.z;
+ w = ShortN4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT4::_XMSHORT4
+(
+ CONST SHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT4::_XMSHORT4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT4::_XMSHORT4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreShort4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMSHORT4& _XMSHORT4::operator=
+(
+ CONST _XMSHORT4& Short4
+)
+{
+ x = Short4.x;
+ y = Short4.y;
+ z = Short4.z;
+ w = Short4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN4::_XMUSHORTN4
+(
+ CONST USHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN4::_XMUSHORTN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN4::_XMUSHORTN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUShortN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORTN4& _XMUSHORTN4::operator=
+(
+ CONST _XMUSHORTN4& UShortN4
+)
+{
+ x = UShortN4.x;
+ y = UShortN4.y;
+ z = UShortN4.z;
+ w = UShortN4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT4::_XMUSHORT4
+(
+ CONST USHORT* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT4::_XMUSHORT4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT4::_XMUSHORT4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUShort4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUSHORT4& _XMUSHORT4::operator=
+(
+ CONST _XMUSHORT4& UShort4
+)
+{
+ x = UShort4.x;
+ y = UShort4.y;
+ z = UShort4.z;
+ w = UShort4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMXDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDECN4::_XMXDECN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDECN4::_XMXDECN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreXDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDECN4& _XMXDECN4::operator=
+(
+ CONST _XMXDECN4& XDecN4
+)
+{
+ v = XDecN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDECN4& _XMXDECN4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMXDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDEC4::_XMXDEC4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDEC4::_XMXDEC4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreXDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDEC4& _XMXDEC4::operator=
+(
+ CONST _XMXDEC4& XDec4
+)
+{
+ v = XDec4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXDEC4& _XMXDEC4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDECN4::_XMDECN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDECN4::_XMDECN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDECN4& _XMDECN4::operator=
+(
+ CONST _XMDECN4& DecN4
+)
+{
+ v = DecN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDECN4& _XMDECN4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDEC4::_XMDEC4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDEC4::_XMDEC4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDEC4& _XMDEC4::operator=
+(
+ CONST _XMDEC4& Dec4
+)
+{
+ v = Dec4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMDEC4& _XMDEC4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDECN4::_XMUDECN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDECN4::_XMUDECN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUDecN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDECN4& _XMUDECN4::operator=
+(
+ CONST _XMUDECN4& UDecN4
+)
+{
+ v = UDecN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDECN4& _XMUDECN4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDEC4::_XMUDEC4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDEC4::_XMUDEC4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUDec4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDEC4& _XMUDEC4::operator=
+(
+ CONST _XMUDEC4& UDec4
+)
+{
+ v = UDec4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUDEC4& _XMUDEC4::operator=
+(
+ CONST UINT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMXICON4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICON4::_XMXICON4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreXIcoN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICON4::_XMXICON4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreXIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICON4& _XMXICON4::operator=
+(
+ CONST _XMXICON4& XIcoN4
+)
+{
+ v = XIcoN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICON4& _XMXICON4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMXICO4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICO4::_XMXICO4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreXIco4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICO4::_XMXICO4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreXIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICO4& _XMXICO4::operator=
+(
+ CONST _XMXICO4& XIco4
+)
+{
+ v = XIco4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMXICO4& _XMXICO4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMICON4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICON4::_XMICON4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreIcoN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICON4::_XMICON4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICON4& _XMICON4::operator=
+(
+ CONST _XMICON4& IcoN4
+)
+{
+ v = IcoN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICON4& _XMICON4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMICO4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICO4::_XMICO4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreIco4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICO4::_XMICO4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICO4& _XMICO4::operator=
+(
+ CONST _XMICO4& Ico4
+)
+{
+ v = Ico4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMICO4& _XMICO4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUICON4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICON4::_XMUICON4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUIcoN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICON4::_XMUICON4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUIcoN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICON4& _XMUICON4::operator=
+(
+ CONST _XMUICON4& UIcoN4
+)
+{
+ v = UIcoN4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICON4& _XMUICON4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUICO4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICO4::_XMUICO4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUIco4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICO4::_XMUICO4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUIco4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICO4& _XMUICO4::operator=
+(
+ CONST _XMUICO4& UIco4
+)
+{
+ v = UIco4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUICO4& _XMUICO4::operator=
+(
+ CONST UINT64 Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMCOLOR4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
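+// XMCOLOR packs the four float channels into a single 32-bit ARGB color via
+// XMStoreColor.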
+XMFINLINE _XMCOLOR::_XMCOLOR
+(
+ FLOAT _r,
+ FLOAT _g,
+ FLOAT _b,
+ FLOAT _a
+)
+{
+ XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMCOLOR::_XMCOLOR
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreColor(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMCOLOR& _XMCOLOR::operator=
+(
+ CONST _XMCOLOR& Color
+)
+{
+ c = Color.c;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMCOLOR& _XMCOLOR::operator=
+(
+ CONST UINT Color
+)
+{
+ c = Color;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+ CONST CHAR* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4::_XMBYTEN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreByteN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTEN4& _XMBYTEN4::operator=
+(
+ CONST _XMBYTEN4& ByteN4
+)
+{
+ x = ByteN4.x;
+ y = ByteN4.y;
+ z = ByteN4.z;
+ w = ByteN4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+ CONST CHAR* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4::_XMBYTE4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreByte4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMBYTE4& _XMBYTE4::operator=
+(
+ CONST _XMBYTE4& Byte4
+)
+{
+ x = Byte4.x;
+ y = Byte4.y;
+ z = Byte4.z;
+ w = Byte4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+ CONST BYTE* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4::_XMUBYTEN4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUByteN4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTEN4& _XMUBYTEN4::operator=
+(
+ CONST _XMUBYTEN4& UByteN4
+)
+{
+ x = UByteN4.x;
+ y = UByteN4.y;
+ z = UByteN4.z;
+ w = UByteN4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+ CONST BYTE* pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4::_XMUBYTE4
+(
+ CONST FLOAT* pArray
+)
+{
+ XMStoreUByte4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUBYTE4& _XMUBYTE4::operator=
+(
+ CONST _XMUBYTE4& UByte4
+)
+{
+ x = UByte4.x;
+ y = UByte4.y;
+ z = UByte4.z;
+ w = UByte4.w;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMUNIBBLE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+ CONST CHAR *pArray
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = pArray[3];
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ FLOAT _w
+)
+{
+ XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4::_XMUNIBBLE4
+(
+ CONST FLOAT *pArray
+)
+{
+ XMStoreUNibble4(this, XMLoadFloat4((const XMFLOAT4*)pArray));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4& _XMUNIBBLE4::operator=
+(
+ CONST _XMUNIBBLE4& UNibble4
+)
+{
+ v = UNibble4.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMUNIBBLE4& _XMUNIBBLE4::operator=
+(
+ CONST USHORT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+/****************************************************************************
+ *
+ * XMU555 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+ CONST CHAR *pArray,
+ BOOL _w
+)
+{
+ x = pArray[0];
+ y = pArray[1];
+ z = pArray[2];
+ w = _w;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+ FLOAT _x,
+ FLOAT _y,
+ FLOAT _z,
+ BOOL _w
+)
+{
+ XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555::_XMU555
+(
+ CONST FLOAT *pArray,
+ BOOL _w
+)
+{
+ XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pArray);
+ XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555& _XMU555::operator=
+(
+ CONST _XMU555& U555
+)
+{
+ v = U555.v;
+ return *this;
+}
+
+//------------------------------------------------------------------------------
+
+XMFINLINE _XMU555& _XMU555::operator=
+(
+ CONST USHORT Packed
+)
+{
+ v = Packed;
+ return *this;
+}
+
+#endif // __cplusplus
+
+#if defined(_XM_NO_INTRINSICS_)
+#undef XMISNAN
+#undef XMISINF
+#endif
+
+#endif // __XNAMATHVECTOR_INL__
+
diff --git a/thirdparty/directxtex/mingw/guid.cpp b/thirdparty/directxtex/mingw/guid.cpp
new file mode 100644
index 00000000..9a1af58b
--- /dev/null
+++ b/thirdparty/directxtex/mingw/guid.cpp
@@ -0,0 +1,5 @@
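+/* Defining INITGUID before including the headers makes DEFINE_GUID emit real
+ * GUID definitions (not just declarations), so the GUIDs declared in sal.h are
+ * instantiated in this one translation unit. */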
+#define INITGUID
+
+#include <windows.h>
+#include "sal.h"
+
diff --git a/thirdparty/directxtex/mingw/sal.h b/thirdparty/directxtex/mingw/sal.h
new file mode 100644
index 00000000..2ecbfc20
--- /dev/null
+++ b/thirdparty/directxtex/mingw/sal.h
@@ -0,0 +1,294 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef _SAL_H_
+#define _SAL_H_
+
+#ifndef __MINGW32__
+#error "This header should only be included for MinGW"
+#endif
+
+
+/*
+ * Several of the defines in compat.h clash with libstdc++ internal variables,
+ * so include the headers now, before they are re-defined.
+ */
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+
+#include "compat.h"
+
+
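+/* Map the secure-CRT memcpy_s onto plain memcpy; the destination-size argument
+ * is ignored. */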
+#define memcpy_s(dest, numberOfElements, src, count) memcpy(dest, src, count)
+
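+/* MinGW/GCC has no MSVC __uuidof operator; fall back to the IID_<interface>
+ * GUID symbols declared alongside each COM interface. */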
+#undef __uuidof
+#define __uuidof(_iface) IID_##_iface
+
+
+#ifndef ERROR_FILE_TOO_LARGE
+#define ERROR_FILE_TOO_LARGE 223L
+#endif
+
+
+#ifdef __GNUC__
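+/* _isnan() comes from the MSVC CRT; provide a GCC fallback that relies on NaN
+ * comparing unequal to itself. */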
+static inline bool _isnan(float x)
+{
+ return x != x;
+}
+#endif
+
+
+#include <wincodec.h>
+
+
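+/* WIC pixel-format GUIDs that the MinGW wincodec.h does not define; entries
+ * presumed to be provided by the MinGW headers are left commented out for
+ * reference. */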
+//DEFINE_GUID(GUID_WICPixelFormatDontCare, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x00);
+//DEFINE_GUID(GUID_WICPixelFormat1bppIndexed, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x01);
+//DEFINE_GUID(GUID_WICPixelFormat2bppIndexed, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x02);
+//DEFINE_GUID(GUID_WICPixelFormat4bppIndexed, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x03);
+//DEFINE_GUID(GUID_WICPixelFormat8bppIndexed, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x04);
+//DEFINE_GUID(GUID_WICPixelFormatBlackWhite, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x05);
+//DEFINE_GUID(GUID_WICPixelFormat2bppGray, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x06);
+//DEFINE_GUID(GUID_WICPixelFormat4bppGray, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x07);
+//DEFINE_GUID(GUID_WICPixelFormat8bppGray, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x08);
+DEFINE_GUID(GUID_WICPixelFormat8bppAlpha, 0xe6cd0116, 0xeeba, 0x4161, 0xaa, 0x85, 0x27, 0xdd, 0x9f, 0xb3, 0xa8, 0x95);
+//DEFINE_GUID(GUID_WICPixelFormat16bppBGR555, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x09);
+//DEFINE_GUID(GUID_WICPixelFormat16bppBGR565, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0a);
+//DEFINE_GUID(GUID_WICPixelFormat16bppBGRA5551, 0x05ec7c2b, 0xf1e6, 0x4961, 0xad, 0x46, 0xe1, 0xcc, 0x81, 0x0a, 0x87, 0xd2);
+//DEFINE_GUID(GUID_WICPixelFormat16bppGray, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0b);
+//DEFINE_GUID(GUID_WICPixelFormat24bppBGR, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0c);
+DEFINE_GUID(GUID_WICPixelFormat24bppRGB, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
+//DEFINE_GUID(GUID_WICPixelFormat32bppBGR, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0e);
+//DEFINE_GUID(GUID_WICPixelFormat32bppBGRA, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0f);
+//DEFINE_GUID(GUID_WICPixelFormat32bppPBGRA, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x10);
+DEFINE_GUID(GUID_WICPixelFormat32bppGrayFloat, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x11);
+DEFINE_GUID(GUID_WICPixelFormat32bppRGBA, 0xf5c7ad2d, 0x6a8d, 0x43dd, 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+DEFINE_GUID(GUID_WICPixelFormat32bppPRGBA, 0x3cc4a650, 0xa527, 0x4d37, 0xa9, 0x16, 0x31, 0x42, 0xc7, 0xeb, 0xed, 0xba);
+//DEFINE_GUID(GUID_WICPixelFormat48bppRGB, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x15);
+DEFINE_GUID(GUID_WICPixelFormat48bppBGR, 0xe605a384, 0xb468, 0x46ce, 0xbb, 0x2e, 0x36, 0xf1, 0x80, 0xe6, 0x43, 0x13);
+//DEFINE_GUID(GUID_WICPixelFormat64bppRGBA, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x16);
+DEFINE_GUID(GUID_WICPixelFormat64bppBGRA, 0x1562ff7c, 0xd352, 0x46f9, 0x97, 0x9e, 0x42, 0x97, 0x6b, 0x79, 0x22, 0x46);
+//DEFINE_GUID(GUID_WICPixelFormat64bppPRGBA, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x17);
+DEFINE_GUID(GUID_WICPixelFormat64bppPBGRA, 0x8c518e8e, 0xa4ec, 0x468b, 0xae, 0x70, 0xc9, 0xa3, 0x5a, 0x9c, 0x55, 0x30);
+DEFINE_GUID(GUID_WICPixelFormat16bppGrayFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x13);
+DEFINE_GUID(GUID_WICPixelFormat32bppBGR101010, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x14);
+DEFINE_GUID(GUID_WICPixelFormat48bppRGBFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x12);
+DEFINE_GUID(GUID_WICPixelFormat48bppBGRFixedPoint, 0x49ca140e, 0xcab6, 0x493b, 0x9d, 0xdf, 0x60, 0x18, 0x7c, 0x37, 0x53, 0x2a);
+DEFINE_GUID(GUID_WICPixelFormat96bppRGBFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x18);
+DEFINE_GUID(GUID_WICPixelFormat128bppRGBAFloat, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x19);
+DEFINE_GUID(GUID_WICPixelFormat128bppPRGBAFloat, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1a);
+DEFINE_GUID(GUID_WICPixelFormat128bppRGBFloat, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1b);
+//DEFINE_GUID(GUID_WICPixelFormat32bppCMYK, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1c);
+DEFINE_GUID(GUID_WICPixelFormat64bppRGBAFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1d);
+DEFINE_GUID(GUID_WICPixelFormat64bppBGRAFixedPoint, 0x356de33c, 0x54d2, 0x4a23, 0xbb, 0x4, 0x9b, 0x7b, 0xf9, 0xb1, 0xd4, 0x2d);
+DEFINE_GUID(GUID_WICPixelFormat64bppRGBFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x40);
+DEFINE_GUID(GUID_WICPixelFormat128bppRGBAFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1e);
+DEFINE_GUID(GUID_WICPixelFormat128bppRGBFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x41);
+DEFINE_GUID(GUID_WICPixelFormat64bppRGBAHalf, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x3a);
+DEFINE_GUID(GUID_WICPixelFormat64bppRGBHalf, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x42);
+DEFINE_GUID(GUID_WICPixelFormat48bppRGBHalf, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x3b);
+DEFINE_GUID(GUID_WICPixelFormat32bppRGBE, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x3d);
+DEFINE_GUID(GUID_WICPixelFormat16bppGrayHalf, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x3e);
+DEFINE_GUID(GUID_WICPixelFormat32bppGrayFixedPoint, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x3f);
+DEFINE_GUID(GUID_WICPixelFormat32bppRGBA1010102, 0x25238D72, 0xFCF9, 0x4522, 0xb5, 0x14, 0x55, 0x78, 0xe5, 0xad, 0x55, 0xe0);
+DEFINE_GUID(GUID_WICPixelFormat32bppRGBA1010102XR, 0x00DE6B9A, 0xC101, 0x434b, 0xb5, 0x02, 0xd0, 0x16, 0x5e, 0xe1, 0x12, 0x2c);
+DEFINE_GUID(GUID_WICPixelFormat64bppCMYK, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x1f);
+DEFINE_GUID(GUID_WICPixelFormat24bpp3Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x20);
+DEFINE_GUID(GUID_WICPixelFormat32bpp4Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x21);
+DEFINE_GUID(GUID_WICPixelFormat40bpp5Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x22);
+DEFINE_GUID(GUID_WICPixelFormat48bpp6Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x23);
+DEFINE_GUID(GUID_WICPixelFormat56bpp7Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x24);
+DEFINE_GUID(GUID_WICPixelFormat64bpp8Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x25);
+DEFINE_GUID(GUID_WICPixelFormat48bpp3Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x26);
+DEFINE_GUID(GUID_WICPixelFormat64bpp4Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x27);
+DEFINE_GUID(GUID_WICPixelFormat80bpp5Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x28);
+DEFINE_GUID(GUID_WICPixelFormat96bpp6Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x29);
+DEFINE_GUID(GUID_WICPixelFormat112bpp7Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2a);
+DEFINE_GUID(GUID_WICPixelFormat128bpp8Channels, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2b);
+DEFINE_GUID(GUID_WICPixelFormat40bppCMYKAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2c);
+DEFINE_GUID(GUID_WICPixelFormat80bppCMYKAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2d);
+DEFINE_GUID(GUID_WICPixelFormat32bpp3ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2e);
+DEFINE_GUID(GUID_WICPixelFormat40bpp4ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x2f);
+DEFINE_GUID(GUID_WICPixelFormat48bpp5ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x30);
+DEFINE_GUID(GUID_WICPixelFormat56bpp6ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x31);
+DEFINE_GUID(GUID_WICPixelFormat64bpp7ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x32);
+DEFINE_GUID(GUID_WICPixelFormat72bpp8ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x33);
+DEFINE_GUID(GUID_WICPixelFormat64bpp3ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x34);
+DEFINE_GUID(GUID_WICPixelFormat80bpp4ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x35);
+DEFINE_GUID(GUID_WICPixelFormat96bpp5ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x36);
+DEFINE_GUID(GUID_WICPixelFormat112bpp6ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x37);
+DEFINE_GUID(GUID_WICPixelFormat128bpp7ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x38);
+DEFINE_GUID(GUID_WICPixelFormat144bpp8ChannelsAlpha, 0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x39);
+
+
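+/* Declare IWICPixelFormatInfo (both the C++ and the C vtable forms) when the
+ * MinGW wincodec.h does not provide it. */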
+#ifndef __IWICPixelFormatInfo_INTERFACE_DEFINED__
+#define __IWICPixelFormatInfo_INTERFACE_DEFINED__
+
+DEFINE_GUID(IID_IWICPixelFormatInfo, 0xE8EDA601, 0x3D48, 0x431a, 0xAB, 0x44, 0x69, 0x05, 0x9B, 0xE8, 0x8B, 0xBE);
+
+#if defined(__cplusplus) && !defined(CINTERFACE)
+
+MIDL_INTERFACE("e8eda601-3d48-431a-ab4469059be88bbe")
+IWICPixelFormatInfo : public IWICComponentInfo
+{
+ virtual HRESULT STDMETHODCALLTYPE GetFormatGUID(
+ GUID *pFormat) = 0;
+
+ virtual HRESULT STDMETHODCALLTYPE GetColorContext(
+ IWICColorContext **ppIColorContext) = 0;
+
+ virtual HRESULT STDMETHODCALLTYPE GetBitsPerPixel(
+ UINT *puiBitsPerPixel) = 0;
+
+ virtual HRESULT STDMETHODCALLTYPE GetChannelCount(
+ UINT *puiChannelCount) = 0;
+
+ virtual HRESULT STDMETHODCALLTYPE GetChannelMask(
+ UINT uiChannelIndex,
+ UINT cbMaskBuffer,
+ BYTE *pbMaskBuffer,
+ UINT *pcbActual) = 0;
+};
+
+#else
+
+typedef struct IWICPixelFormatInfoVtbl {
+ BEGIN_INTERFACE
+
+ /*** IUnknown methods ***/
+ HRESULT (STDMETHODCALLTYPE *QueryInterface)(
+ IWICPixelFormatInfo* This,
+ REFIID riid,
+ void **ppvObject);
+
+ ULONG (STDMETHODCALLTYPE *AddRef)(
+ IWICPixelFormatInfo* This);
+
+ ULONG (STDMETHODCALLTYPE *Release)(
+ IWICPixelFormatInfo* This);
+
+ /*** IWICComponentInfo methods ***/
+ HRESULT (STDMETHODCALLTYPE *GetComponentType)(
+ IWICPixelFormatInfo* This,
+ WICComponentType *pType);
+
+ HRESULT (STDMETHODCALLTYPE *GetCLSID)(
+ IWICPixelFormatInfo* This,
+ CLSID *pclsid);
+
+ HRESULT (STDMETHODCALLTYPE *GetSigningStatus)(
+ IWICPixelFormatInfo* This,
+ DWORD *pStatus);
+
+ HRESULT (STDMETHODCALLTYPE *GetAuthor)(
+ IWICPixelFormatInfo* This,
+ UINT cchAuthor,
+ WCHAR *wzAuthor,
+ UINT *pcchActual);
+
+ HRESULT (STDMETHODCALLTYPE *GetVendorGUID)(
+ IWICPixelFormatInfo* This,
+ GUID *pguidVendor);
+
+ HRESULT (STDMETHODCALLTYPE *GetVersion)(
+ IWICPixelFormatInfo* This,
+ UINT cchVersion,
+ WCHAR *wzVersion,
+ UINT *pcchActual);
+
+ HRESULT (STDMETHODCALLTYPE *GetSpecVersion)(
+ IWICPixelFormatInfo* This,
+ UINT cchSpecVersion,
+ WCHAR *wzSpecVersion,
+ UINT *pcchActual);
+
+ HRESULT (STDMETHODCALLTYPE *GetFriendlyName)(
+ IWICPixelFormatInfo* This,
+ UINT cchFriendlyName,
+ WCHAR *wzFriendlyName,
+ UINT *pcchActual);
+
+ /*** IWICPixelFormatInfo methods ***/
+ HRESULT (STDMETHODCALLTYPE *GetFormatGUID)(
+ IWICPixelFormatInfo* This,
+ GUID *pFormat);
+
+ HRESULT (STDMETHODCALLTYPE *GetColorContext)(
+ IWICPixelFormatInfo* This,
+ IWICColorContext **ppIColorContext);
+
+ HRESULT (STDMETHODCALLTYPE *GetBitsPerPixel)(
+ IWICPixelFormatInfo* This,
+ UINT *puiBitsPerPixel);
+
+ HRESULT (STDMETHODCALLTYPE *GetChannelCount)(
+ IWICPixelFormatInfo* This,
+ UINT *puiChannelCount);
+
+ HRESULT (STDMETHODCALLTYPE *GetChannelMask)(
+ IWICPixelFormatInfo* This,
+ UINT uiChannelIndex,
+ UINT cbMaskBuffer,
+ BYTE *pbMaskBuffer,
+ UINT *pcbActual);
+
+ END_INTERFACE
+} IWICPixelFormatInfoVtbl;
+interface IWICPixelFormatInfo {
+ CONST_VTBL struct IWICPixelFormatInfoVtbl *lpVtbl;
+};
+
+#ifdef COBJMACROS
+/*** IUnknown methods ***/
+#define IWICPixelFormatInfo_QueryInterface(This,riid,ppvObject) ((This)->lpVtbl->QueryInterface(This,riid,ppvObject))
+#define IWICPixelFormatInfo_AddRef(This) ((This)->lpVtbl->AddRef(This))
+#define IWICPixelFormatInfo_Release(This) ((This)->lpVtbl->Release(This))
+/*** IWICComponentInfo methods ***/
+#define IWICPixelFormatInfo_GetComponentType(This,pType) ((This)->lpVtbl->GetComponentType(This,pType))
+#define IWICPixelFormatInfo_GetCLSID(This,pclsid) ((This)->lpVtbl->GetCLSID(This,pclsid))
+#define IWICPixelFormatInfo_GetSigningStatus(This,pStatus) ((This)->lpVtbl->GetSigningStatus(This,pStatus))
+#define IWICPixelFormatInfo_GetAuthor(This,cchAuthor,wzAuthor,pcchActual) ((This)->lpVtbl->GetAuthor(This,cchAuthor,wzAuthor,pcchActual))
+#define IWICPixelFormatInfo_GetVendorGUID(This,pguidVendor) ((This)->lpVtbl->GetVendorGUID(This,pguidVendor))
+#define IWICPixelFormatInfo_GetVersion(This,cchVersion,wzVersion,pcchActual) ((This)->lpVtbl->GetVersion(This,cchVersion,wzVersion,pcchActual))
+#define IWICPixelFormatInfo_GetSpecVersion(This,cchSpecVersion,wzSpecVersion,pcchActual) ((This)->lpVtbl->GetSpecVersion(This,cchSpecVersion,wzSpecVersion,pcchActual))
+#define IWICPixelFormatInfo_GetFriendlyName(This,cchFriendlyName,wzFriendlyName,pcchActual) ((This)->lpVtbl->GetFriendlyName(This,cchFriendlyName,wzFriendlyName,pcchActual))
+/*** IWICPixelFormatInfo methods ***/
+#define IWICPixelFormatInfo_GetFormatGUID(This,pFormat) ((This)->lpVtbl->GetFormatGUID(This,pFormat))
+#define IWICPixelFormatInfo_GetColorContext(This,ppIColorContext) ((This)->lpVtbl->GetColorContext(This,ppIColorContext))
+#define IWICPixelFormatInfo_GetBitsPerPixel(This,puiBitsPerPixel) ((This)->lpVtbl->GetBitsPerPixel(This,puiBitsPerPixel))
+#define IWICPixelFormatInfo_GetChannelCount(This,puiChannelCount) ((This)->lpVtbl->GetChannelCount(This,puiChannelCount))
+#define IWICPixelFormatInfo_GetChannelMask(This,uiChannelIndex,cbMaskBuffer,pbMaskBuffer,pcbActual) ((This)->lpVtbl->GetChannelMask(This,uiChannelIndex,cbMaskBuffer,pbMaskBuffer,pcbActual))
+#endif
+
+#endif
+
+#endif /* __IWICPixelFormatInfo_INTERFACE_DEFINED__ */
+
+
+#endif /* _SAL_H_ */