5 files changed, 723 insertions, 0 deletions
diff --git a/src/arch/x86-64/Makefile.am b/src/arch/x86-64/Makefile.am
new file mode 100644
index 0000000..ad4c4c8
--- /dev/null
+++ b/src/arch/x86-64/Makefile.am
@@ -0,0 +1,40 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+if HAVE_X86_64_ASM
+
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/mesa \
+	-I$(top_srcdir)/src/GLdispatch/mapi \
+	$(API_DEFINES) \
+	$(DEFINES)
+
+noinst_PROGRAMS = gen_matypes
+# gen_matypes is a build-time helper only; its standard output becomes matypes.h.
+gen_matypes_SOURCES = ../x86/gen_matypes.c
+BUILT_SOURCES = matypes.h
+CLEANFILES = matypes.h
+# Name the helper with $(EXEEXT) and delete a partial header if it fails, so a later make cannot reuse it.
+matypes.h: gen_matypes$(EXEEXT)
+	$(AM_V_GEN)./gen_matypes$(EXEEXT) > $@ || { rm -f $@; exit 1; }
+
+endif
diff --git a/src/arch/x86-64/calling_convention.txt b/src/arch/x86-64/calling_convention.txt
new file mode 100644
index 0000000..4147f7e
--- /dev/null
+++ b/src/arch/x86-64/calling_convention.txt
@@ -0,0 +1,50 @@
+Register Usage
+rax      temporary register; with variable arguments passes information
+         about the number of SSE registers used; 1st return register
+
+rbx*     callee-saved register; optionally used as base pointer
+
+rcx      used to pass 4th integer argument to functions
+
+rdx      used to pass 3rd argument to functions; 2nd return register
+
+rsp*     stack pointer
+
+rbp*     callee-saved register; optionally used as frame pointer
+
+rsi      used to pass 2nd argument to functions
+
+rdi      used to pass 1st argument to functions
+
+r8       used to pass 5th argument to functions
+
+r9       used to pass 6th argument to functions
+
+r10      temporary register, used for passing a function's static chain pointer
+
+r11      temporary register
+
+r12-15*  callee-saved registers
+
+xmm0-1   used to pass and return floating point arguments
+
+xmm2-7   used to pass floating point arguments
+
+xmm8-15  temporary registers
+
+mmx0-7   temporary registers
+
+st0      temporary register; used to return long double arguments
+
+st1      temporary register; used to return long double arguments
+
+st2-7    temporary registers
+
+fs       Reserved for system use (as thread specific data register)
+
+	
+
+*) must be preserved across function calls
+
+Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack
+Floating point arguments from list: xmm0-xmm7
+\ No newline at end of file
diff --git a/src/arch/x86-64/x86-64.c b/src/arch/x86-64/x86-64.c
new file mode 100644
index 0000000..10564d9
--- /dev/null
+++ b/src/arch/x86-64/x86-64.c
@@ -0,0 +1,119 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by
+ * Mikko Tiihonen
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+#include "x86-64.h"
+#include "../x86/x86_xform.h"
+/* The _math_test_* calls in this file are compiled under DEBUG_MATH, so declare them for that build too. */
+#if defined(DEBUG) || defined(DEBUG_MATH)
+#include "math/m_debug.h"
+#endif
+/* Implemented in xform4.S: runs CPUID with eax = regs[0], ecx = regs[2]; returns eax..edx in regs[0..3]. */
+extern void _mesa_x86_64_cpuid(unsigned int *regs);
+/* Presumably declares the per-prefix transform_points4 routines (macro from x86_xform.h) -- TODO confirm. */
+DECLARE_XFORM_GROUP( x86_64, 4 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
+
+#else
+/* just to silence warning below */
+#include "x86-64.h"
+#endif
+
+/*
+extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS );
+*/
+
+#ifdef USE_X86_64_ASM
+/* Forward msg to _mesa_debug(), but only when MESA_DEBUG is set in the environment. */
+static void message( const char *msg )
+{
+   if (!_mesa_getenv("MESA_DEBUG")) return;
+   _mesa_debug( NULL, "%s", msg );
+}
+#endif
+
+
+void _mesa_init_all_x86_64_transform_asm(void)
+{
+#ifdef USE_X86_64_ASM
+   /* CPUID buffer; _mesa_x86_64_cpuid() treats it as eax, ebx, ecx, edx. */
+   unsigned int regs[4] = { 0x80000001, 0x00000000, 0x00000000, 0x00000000 };
+   unsigned int edx;
+
+   /* With MESA_NO_ASM set, no table entry is touched. */
+   if ( _mesa_getenv( "MESA_NO_ASM" ) )
+      return;
+
+   message("Initializing x86-64 optimizations\n");
+
+   /* These three entries are installed regardless of the CPUID result. */
+   _mesa_transform_tab[4][MATRIX_3D] =
+      _mesa_x86_64_transform_points4_3d;
+   _mesa_transform_tab[4][MATRIX_IDENTITY] =
+      _mesa_x86_64_transform_points4_identity;
+   _mesa_transform_tab[4][MATRIX_GENERAL] =
+      _mesa_x86_64_transform_points4_general;
+
+   /* Query CPUID function 0x80000001 and test bit 31 of the returned edx. */
+   _mesa_x86_64_cpuid(regs);
+   edx = regs[3];
+   if ((edx >> 31) & 1U) {
+      message("3Dnow! detected\n");
+      /* The remaining matrix types get the 3DNow! routines. */
+      _mesa_transform_tab[4][MATRIX_2D] =
+         _mesa_3dnow_transform_points4_2d;
+      _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
+         _mesa_3dnow_transform_points4_2d_no_rot;
+      _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
+         _mesa_3dnow_transform_points4_perspective;
+      _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
+         _mesa_3dnow_transform_points4_3d_no_rot;
+   }
+
+#ifdef DEBUG_MATH
+   /* Self-tests, compiled in only for DEBUG_MATH builds. */
+   _math_test_all_transform_functions("x86_64");
+   _math_test_all_cliptest_functions("x86_64");
+   _math_test_all_normal_transform_functions("x86_64");
+#endif
+
+#endif
+}
diff --git a/src/arch/x86-64/x86-64.h b/src/arch/x86-64/x86-64.h
new file mode 100644
index 0000000..1d931fa
--- /dev/null
+++ b/src/arch/x86-64/x86-64.h
@@ -0,0 +1,31 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __X86_64_ASM_H__
+#define __X86_64_ASM_H__
+/* Defined in x86-64.c; takes no arguments and returns nothing. */
+extern void _mesa_init_all_x86_64_transform_asm( void );
+
+#endif
diff --git a/src/arch/x86-64/xform4.S b/src/arch/x86-64/xform4.S
new file mode 100644
index 0000000..5abd5a2
--- /dev/null
+++ b/src/arch/x86-64/xform4.S
@@ -0,0 +1,483 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.1
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "matypes.h"
+
+.text
+
+.align 16
+.globl _mesa_x86_64_cpuid
+.hidden _mesa_x86_64_cpuid
+_mesa_x86_64_cpuid:			/* rdi = unsigned int regs[4] */
+	pushq	%rbx			/* cpuid overwrites rbx, which is callee-saved */
+	movl	0(%rdi), %eax		/* eax = regs[0] */
+	movl	8(%rdi), %ecx		/* ecx = regs[2] */
+	/* regs[1] and regs[3] are outputs only: ebx and edx are not preloaded. */
+	cpuid
+	/* Store the four result registers back in regs[] order. */
+	movl	%eax, 0(%rdi)
+	movl	%ebx, 4(%rdi)
+	movl	%ecx, 8(%rdi)
+	movl	%edx, 12(%rdi)
+	popq	%rbx
+	ret
+
+.align 16
+.globl _mesa_x86_64_transform_points4_general
+.hidden _mesa_x86_64_transform_points4_general
+_mesa_x86_64_transform_points4_general:
+/*
+ * In:  rdi = dest vector, rsi = matrix (16 floats), rdx = source vector.
+ * Out: dest count/size/flags updated; one 4-float result per source vertex.
+ * movaps requires the matrix and the dest data to be 16-byte aligned.
+ */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x66, 0x66, 0x90	/* manual align += 5; one byte wider because movl is one byte shorter than movzbl */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	prefetchnta 64(%rsi)		/* leaves the flags from testl intact */
+	jz p4_general_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch 16(%rdx)
+
+	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
+	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
+	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
+
+p4_general_loop:
+	/* Per vertex: xmm0 = ox*xmm4 + oy*xmm5 + oz*xmm6 + ow*xmm7 */
+	movups (%rdx), %xmm8		/* ow | oz | oy | ox (source may be unaligned) */
+	prefetchw 16(%rdi)
+
+	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
+	addq %rax, %rdx			/* advance src by its stride */
+	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
+	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
+	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
+	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
+	prefetch 16(%rdx)
+	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+	addq $16, %rdi			/* dest is packed, 16 bytes per vertex */
+
+	decl %ecx
+	jnz p4_general_loop
+
+p4_general_done:
+	.byte 0xf3			/* rep prefix: together with ret this encodes "rep ret" */
+	ret
+	
+.section .rodata
+/* Two 16-byte constants loaded with movaps by _mesa_x86_64_transform_points4_3d. */
+.align 16
+p4_constants:
+.long  0xffffffff		/* +0:  AND mask, keeps the three low lanes ... */
+.long  0xffffffff
+.long  0xffffffff
+.long  0x00000000		/*      ... and clears the top lane */
+/* +16: zero in the three low lanes, 1.0f in the top lane (OR-ed in after masking) */
+.long  0x00000000
+.long  0x00000000
+.long  0x00000000
+.float 1.0
+
+.text
+.align 16
+.globl _mesa_x86_64_transform_points4_3d
+.hidden _mesa_x86_64_transform_points4_3d
+/*
+ * In: rdi = dest vector, rsi = matrix (16 floats), rdx = source vector.
+ * Extra setup versus the general routine: m3, m7, m11, m15 are forced to 0, 0, 0, 1.
+ */
+_mesa_x86_64_transform_points4_3d:
+	/* rax is a scratch pointer here; it is reloaded with the stride below. */
+	leaq p4_constants(%rip), %rax
+
+	prefetchnta 64(%rsi)
+	
+	movaps (%rax), %xmm9		/* 0   | ~0  | ~0  | ~0  (lane mask) */
+	movaps 16(%rax), %xmm10		/* 1.0 | 0   | 0   | 0   */
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	jz p4_3d_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch 16(%rdx)
+
+	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
+	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
+	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
+	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
+	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
+	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
+	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
+	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
+	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4; one byte wider because movl is one byte shorter than movzbl */
+	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */
+
+p4_3d_loop:
+	/* Per vertex: xmm0 = ox*xmm4 + oy*xmm5 + oz*xmm6 + ow*xmm7 (masked matrix) */
+	movups (%rdx), %xmm8		/* ow | oz | oy | ox (source may be unaligned) */
+	prefetchw 16(%rdi)
+
+	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
+	addq %rax, %rdx			/* advance src by its stride */
+	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
+	mulps %xmm4, %xmm0		/* ox*0.0 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+	mulps %xmm5, %xmm1		/* oy*0.0 | oy*m6 | oy*m5 | oy*m4 */
+	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
+	mulps %xmm6, %xmm2		/* oz*0.0 | oz*m10 | oz*m9 | oz*m8 */
+	addps %xmm1, %xmm0		/* sum of the ox and oy terms */
+	mulps %xmm7, %xmm3		/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
+	addps %xmm2, %xmm0		/* ... plus the oz terms */
+	prefetch 16(%rdx)
+	addps %xmm3, %xmm0		/* ... plus the ow terms */
+
+	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+	addq $16, %rdi			/* dest is packed, 16 bytes per vertex */
+
+	decl %ecx
+	jnz p4_3d_loop
+
+p4_3d_done:
+	.byte 0xf3			/* rep prefix: together with ret this encodes "rep ret" */
+	ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_identity
+.hidden _mesa_x86_64_transform_points4_identity
+_mesa_x86_64_transform_points4_identity:
+/* In: rdi = dest vector, rdx = source vector; the matrix in rsi is never read. Clobbers rax, rcx, rsi, rdi, xmm0. */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax) */
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	/* Copy vertex by vertex: one flat "rep movsq" is only correct when the source stride is 16. */
+	testl %ecx, %ecx		/* verify non-zero count */
+	jz p4_identity_done
+	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+	prefetch 64(%rsi)
+	prefetchw 64(%rdi)
+p4_identity_loop:
+	movups (%rsi), %xmm0		/* load one 4-float vertex, any alignment */
+	addq %rax, %rsi			/* advance src by its stride */
+	movups %xmm0, (%rdi)		/* unaligned store: no new alignment requirement on dest */
+	addq $16, %rdi			/* dest is packed, 16 bytes per vertex */
+	decl %ecx
+	jnz p4_identity_loop
+p4_identity_done:
+	.byte 0xf3			/* rep prefix: together with ret this encodes "rep ret" */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_3d_no_rot
+.hidden _mesa_3dnow_transform_points4_3d_no_rot
+_mesa_3dnow_transform_points4_3d_no_rot:
+/* In: rdi = dest vector, rsi = matrix, rdx = source vector.  Reads only m00, m11, m22, m30, m31, m32. */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4; one byte wider because movl is one byte shorter than movzbl */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 (nop, flags unchanged) */
+	jz p4_3d_no_rot_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch (%rdx)
+	
+	movd (%rsi), %mm0		/*                 | m00             */
+	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+
+	movd 40(%rsi), %mm2		/*                 | m22             */
+	movq 48(%rsi), %mm1		/* m31             | m30             */
+
+	punpckldq 56(%rsi), %mm2	/* m32             | m22             */
+
+p4_3d_no_rot_loop:
+	/* r0 = x0*m00+x3*m30, r1 = x1*m11+x3*m31, r2 = x2*m22+x3*m32, r3 = x3 */
+	prefetchw 32(%rdi)
+	
+	movq  (%rdx), %mm4		/* x1              | x0              */
+	movq  8(%rdx), %mm5		/* x3              | x2              */
+	movd  12(%rdx), %mm7		/*                 | x3              */
+
+	movq  %mm5, %mm6		/* x3              | x2              */
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */
+
+	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
+	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */
+
+	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+
+	addq %rax, %rdx			/* advance src by its stride         */
+	movq %mm4, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3                      */
+
+	addq $16, %rdi			/* dest is packed, 16 bytes/vertex   */
+	
+	decl %ecx
+	prefetch 32(%rdx)		/* leaves the flags from decl intact */
+	jnz p4_3d_no_rot_loop
+
+p4_3d_no_rot_done:
+	femms				/* leave MMX/3DNow! state before returning */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_perspective
+.hidden _mesa_3dnow_transform_points4_perspective
+_mesa_3dnow_transform_points4_perspective:
+/* In: rdi = dest vector, rsi = matrix, rdx = source vector.  Reads only m00, m11, m20, m21, m22, m32. */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4; one byte wider because movl is one byte shorter than movzbl */
+	jz p4_perspective_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+	pxor %mm7, %mm7			/* 0               | 0               */
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+	
+	movq 32(%rsi), %mm2		/* m21             | m20             */
+	prefetch (%rdx)
+	
+	movd 40(%rsi), %mm1		/*                 | m22             */
+
+	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
+	punpckldq 56(%rsi), %mm1	/* m32             | m22             */
+
+
+p4_perspective_loop:
+	/* r0 = x0*m00+x2*m20, r1 = x1*m11+x2*m21, r2 = x2*m22+x3*m32, r3 = -x2 */
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm4		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+	movd 8(%rdx), %mm3		/*                 | x2              */
+
+	movq %mm5, %mm6			/* x3              | x2              */
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+
+	punpckldq %mm5, %mm5		/* x2              | x2              */
+
+	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
+	pfsubr %mm7, %mm3		/* 0               | 0 - x2          */
+
+	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
+	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
+
+	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */
+
+	movq %mm5, (%rdi)		/* write r0, r1                      */
+	addq %rax, %rdx			/* advance src by its stride         */
+	movq %mm6, 8(%rdi)		/* write r2, r3                      */
+
+	addq $16, %rdi			/* dest is packed, 16 bytes/vertex   */
+
+	decl %ecx
+	prefetch 32(%rdx)		/* 32 bytes past the next src vertex */
+	jnz p4_perspective_loop
+
+p4_perspective_done:
+	femms				/* leave MMX/3DNow! state before returning */
+	ret
+
+.align 16
+.globl _mesa_3dnow_transform_points4_2d_no_rot
+.hidden _mesa_3dnow_transform_points4_2d_no_rot
+_mesa_3dnow_transform_points4_2d_no_rot:
+/* In: rdi = dest vector, rsi = matrix, rdx = source vector.  Reads only m00, m11, m30, m31. */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	.byte 0x66, 0x90		/* manual align += 2; one byte wider because movl is one byte shorter than movzbl */
+	jz p4_2d_no_rot_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+	prefetch (%rdx)
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+	
+	movq 48(%rsi), %mm1		/* m31             | m30             */
+
+p4_2d_no_rot_loop:
+	/* r0 = x0*m00+x3*m30, r1 = x1*m11+x3*m31, r2 = x2, r3 = x3 */
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm4		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+	movq %mm5, %mm6			/* x3              | x2              */
+
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+
+	addq %rax, %rdx			/* advance src by its stride         */
+	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
+
+	prefetch 32(%rdx)		/* 32 bytes past the next src vertex */
+	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+
+	movq %mm6, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */
+
+	addq $16, %rdi			/* dest is packed, 16 bytes/vertex   */
+
+	decl %ecx
+	jnz p4_2d_no_rot_loop
+
+p4_2d_no_rot_done:
+	femms				/* leave MMX/3DNow! state before returning */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_2d
+.hidden _mesa_3dnow_transform_points4_2d
+_mesa_3dnow_transform_points4_2d:
+/* In: rdi = dest vector, rsi = matrix, rdx = source vector.  Reads only m00, m01, m10, m11, m30, m31. */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movl V4F_STRIDE(%rdx), %eax	/* stride in bytes: load all 32 bits (zero-extends to rax); a byte load truncates strides > 255 */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4; one byte wider because movl is one byte shorter than movzbl */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 (nop, flags unchanged) */
+	jz p4_2d_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+	movd 4(%rsi), %mm1		/*                 | m01             */
+
+	prefetch (%rdx)
+
+	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
+
+	movq 48(%rsi), %mm2		/* m31             | m30             */
+
+p4_2d_loop:
+	/* r0 = x0*m00+x1*m10+x3*m30, r1 = x0*m01+x1*m11+x3*m31, r2 = x2, r3 = x3 */
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm3		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+
+	movq %mm3, %mm4			/* x1              | x0              */
+	movq %mm5, %mm6			/* x3              | x2              */
+
+	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+
+	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
+
+	addq %rax, %rdx			/* advance src by its stride         */
+	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+
+	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
+	prefetch 32(%rdx)		/* 32 bytes past the next src vertex */
+
+	pfadd %mm6, %mm3		/* r1              | r0              */
+
+	movq %mm3, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */
+
+	addq $16, %rdi			/* dest is packed, 16 bytes/vertex   */
+
+	decl %ecx
+	jnz p4_2d_loop
+
+p4_2d_done:
+	femms				/* leave MMX/3DNow! state before returning */
+	ret
+			
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif