mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 05:18:08 +02:00
x86-64 transform optimizations (Mikko T.)
This commit is contained in:
parent
e3f684b753
commit
42fa81275c
17 changed files with 947 additions and 24 deletions
1
Makefile
1
Makefile
|
|
@ -108,6 +108,7 @@ linux-x86 \
|
|||
linux-x86-debug \
|
||||
linux-x86-32 \
|
||||
linux-x86-64 \
|
||||
linux-x86-64-debug \
|
||||
linux-x86-64-static \
|
||||
linux-x86-glide \
|
||||
linux-x86-static \
|
||||
|
|
|
|||
|
|
@ -8,14 +8,14 @@ CONFIG_NAME = linux-x86-64
|
|||
CC = gcc
|
||||
CXX = g++
|
||||
|
||||
CFLAGS = -m64 -Wall -O3 -ansi -pedantic -fPIC -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include
|
||||
CFLAGS = -m64 -Wall -O3 -std=c99 -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include -DUSE_X86_64_ASM
|
||||
|
||||
CXXFLAGS = -m64 -Wall -O3 -ansi -pedantic -fPIC -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE
|
||||
CXXFLAGS = -m64 -Wall -O3 -std=c99 -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE
|
||||
|
||||
GLUT_CFLAGS = -fexceptions
|
||||
|
||||
|
||||
#ASM_SOURCES = $(X86_SOURCES)
|
||||
ASM_SOURCES = $(X86-64_SOURCES)
|
||||
|
||||
|
||||
LIB_DIR = $(TOP)/lib64
|
||||
|
|
|
|||
28
configs/linux-x86-64-debug
Normal file
28
configs/linux-x86-64-debug
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Configuration for Linux for 64-bit X86 (Opteron), debug build
|
||||
|
||||
include $(TOP)/configs/default
|
||||
|
||||
CONFIG_NAME = linux-x86-64-debug
|
||||
|
||||
# Compiler and flags
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
|
||||
CFLAGS = -g -m64 -Wall -O3 -std=c99 -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include -DUSE_X86_64_ASM -DDEBUG -DMESA_DEBUG -DRUN_DEBUG_BENCHMARK
|
||||
|
||||
CXXFLAGS = -g -m64 -Wall -O3 -ansi -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DDEBUG -DMESA_DEBUG -DRUN_DEBUG_BENCHMARK
|
||||
|
||||
GLUT_CFLAGS = -fexceptions
|
||||
|
||||
|
||||
ASM_SOURCES = $(X86-64_SOURCES)
|
||||
|
||||
|
||||
LIB_DIR = $(TOP)/lib64
|
||||
|
||||
|
||||
# Library/program dependencies
|
||||
GL_LIB_DEPS = -L/usr/X11R6/lib64 -lX11 -lXext -lm -lpthread
|
||||
GLUT_LIB_DEPS = -L$(LIB_DIR) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib64 -lX11 -lXmu -lXt -lXi -lm
|
||||
GLW_LIB_DEPS = -L$(LIB_DIR) -l$(GL_LIB) -L/usr/X11R6/lib64 -lXt -lX11
|
||||
APP_LIB_DEPS = -L$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -lm
|
||||
|
|
@ -146,6 +146,7 @@ osmesa-only: depend subdirs $(LIB_DIR)/$(OSMESA_LIB_NAME)
|
|||
|
||||
subdirs:
|
||||
@ (cd x86 ; $(MAKE))
|
||||
@ (cd x86-64 ; $(MAKE))
|
||||
|
||||
# Make the GL library
|
||||
$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS)
|
||||
|
|
@ -223,5 +224,6 @@ clean:
|
|||
-rm -f drivers/*/*.o
|
||||
(cd drivers/dri ; $(MAKE) clean)
|
||||
(cd x86 ; $(MAKE) clean)
|
||||
(cd x86-64 ; $(MAKE) clean)
|
||||
|
||||
include depend
|
||||
|
|
|
|||
|
|
@ -185,6 +185,44 @@ extern char *mesa_profile;
|
|||
|
||||
#endif
|
||||
|
||||
#elif defined(__amd64__)
|
||||
|
||||
#define rdtscll(val) do { \
|
||||
unsigned int a,d; \
|
||||
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); \
|
||||
(val) = ((unsigned long)a) | (((unsigned long)d)<<32); \
|
||||
} while(0)
|
||||
|
||||
/* Copied from i386 PIII version */
|
||||
#define INIT_COUNTER() \
|
||||
do { \
|
||||
int cycle_i; \
|
||||
counter_overhead = LONG_MAX; \
|
||||
for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) { \
|
||||
unsigned long cycle_tmp1, cycle_tmp2; \
|
||||
rdtscll(cycle_tmp1); \
|
||||
rdtscll(cycle_tmp2); \
|
||||
if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) { \
|
||||
counter_overhead = cycle_tmp2 - cycle_tmp1; \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define BEGIN_RACE(x) \
|
||||
x = LONG_MAX; \
|
||||
for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) { \
|
||||
unsigned long cycle_tmp1, cycle_tmp2; \
|
||||
rdtscll(cycle_tmp1); \
|
||||
|
||||
#define END_RACE(x) \
|
||||
rdtscll(cycle_tmp2); \
|
||||
if ( x > (cycle_tmp2 - cycle_tmp1) ) { \
|
||||
x = cycle_tmp2 - cycle_tmp1; \
|
||||
} \
|
||||
} \
|
||||
x -= counter_overhead;
|
||||
|
||||
#elif defined(__sparc__)
|
||||
|
||||
#define INIT_COUNTER() \
|
||||
|
|
|
|||
|
|
@ -166,7 +166,7 @@ ALIGN16(static GLfloat, d[TEST_COUNT][4]);
|
|||
ALIGN16(static GLfloat, r[TEST_COUNT][4]);
|
||||
|
||||
static int test_transform_function( transform_func func, int psize,
|
||||
int mtype, long *cycles )
|
||||
int mtype, unsigned long *cycles )
|
||||
{
|
||||
GLvector4f source[1], dest[1], ref[1];
|
||||
GLmatrix mat[1];
|
||||
|
|
@ -187,7 +187,7 @@ static int test_transform_function( transform_func func, int psize,
|
|||
mat->type = mtypes[mtype];
|
||||
|
||||
m = mat->m;
|
||||
ASSERT( ((GLuint)m & 15) == 0 );
|
||||
ASSERT( ((long)m & 15) == 0 );
|
||||
|
||||
init_matrix( m );
|
||||
|
||||
|
|
@ -279,7 +279,7 @@ static int test_transform_function( transform_func func, int psize,
|
|||
void _math_test_all_transform_functions( char *description )
|
||||
{
|
||||
int psize, mtype;
|
||||
long benchmark_tab[4][7];
|
||||
unsigned long benchmark_tab[4][7];
|
||||
static int first_time = 1;
|
||||
|
||||
if ( first_time ) {
|
||||
|
|
@ -291,7 +291,7 @@ void _math_test_all_transform_functions( char *description )
|
|||
if ( mesa_profile ) {
|
||||
if ( !counter_overhead ) {
|
||||
INIT_COUNTER();
|
||||
_mesa_printf("counter overhead: %ld cycles\n\n", counter_overhead );
|
||||
_mesa_printf("counter overhead: %lu cycles\n\n", counter_overhead );
|
||||
}
|
||||
_mesa_printf("transform results after hooking in %s functions:\n", description );
|
||||
}
|
||||
|
|
@ -310,7 +310,7 @@ void _math_test_all_transform_functions( char *description )
|
|||
for ( mtype = 0 ; mtype < 7 ; mtype++ ) {
|
||||
for ( psize = 1 ; psize <= 4 ; psize++ ) {
|
||||
transform_func func = _mesa_transform_tab[psize][mtypes[mtype]];
|
||||
long *cycles = &(benchmark_tab[psize-1][mtype]);
|
||||
unsigned long *cycles = &(benchmark_tab[psize-1][mtype]);
|
||||
|
||||
if ( test_transform_function( func, psize, mtype, cycles ) == 0 ) {
|
||||
char buf[100];
|
||||
|
|
|
|||
|
|
@ -51,6 +51,10 @@
|
|||
#include "x86/common_x86_asm.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_X86_64_ASM
|
||||
#include "x86-64/x86-64.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_SPARC_ASM
|
||||
#include "sparc/sparc.h"
|
||||
#endif
|
||||
|
|
@ -212,6 +216,8 @@ _math_init_transformation( void )
|
|||
_mesa_init_all_sparc_transform_asm();
|
||||
#elif defined( USE_PPC_ASM )
|
||||
_mesa_init_all_ppc_transform_asm();
|
||||
#elif defined( USE_X86_64_ASM )
|
||||
_mesa_init_all_x86_64_transform_asm();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -197,7 +197,8 @@ ASM_C_SOURCES = \
|
|||
x86/3dnow.c \
|
||||
x86/sse.c \
|
||||
sparc/sparc.c \
|
||||
ppc/common_ppc.c
|
||||
ppc/common_ppc.c \
|
||||
x86-64/x86-64.c
|
||||
|
||||
X86_SOURCES = \
|
||||
x86/common_x86_asm.S \
|
||||
|
|
@ -222,6 +223,9 @@ X86_SOURCES = \
|
|||
X86_API = \
|
||||
x86/glapi_x86.S
|
||||
|
||||
X86-64_SOURCES = \
|
||||
x86-64/xform4.S
|
||||
|
||||
SPARC_SOURCES = \
|
||||
sparc/clip.S \
|
||||
sparc/norm.S \
|
||||
|
|
|
|||
29
src/mesa/x86-64/Makefile
Normal file
29
src/mesa/x86-64/Makefile
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
# src/mesa/x86-64/Makefile
|
||||
|
||||
TOP = ../../..
|
||||
|
||||
include $(TOP)/configs/current
|
||||
|
||||
|
||||
|
||||
INCLUDE_DIRS = \
|
||||
-I$(TOP)/include/GL \
|
||||
-I$(TOP)/include \
|
||||
-I.. \
|
||||
-I../main \
|
||||
-I../math \
|
||||
-I../glapi \
|
||||
-I../tnl
|
||||
|
||||
|
||||
default: matypes.h
|
||||
|
||||
clean:
|
||||
rm -f matypes.h
|
||||
|
||||
|
||||
# need some special rules here, unfortunately
|
||||
matypes.h: ../main/mtypes.h ../tnl/t_context.h ../x86/gen_matypes
|
||||
../x86/gen_matypes | grep -v '#include "assyntax.h' > matypes.h
|
||||
|
||||
xform4.o: matypes.h
|
||||
50
src/mesa/x86-64/calling_convention.txt
Normal file
50
src/mesa/x86-64/calling_convention.txt
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
Register Usage
|
||||
rax temporary register; with variable arguments passes information
|
||||
about the number of SSE registers used; 1st return register
|
||||
|
||||
rbx* callee-saved register; optionally used as base pointer
|
||||
|
||||
rcx used to pass 4th integer argument to functions
|
||||
|
||||
rdx used to pass 3rd argument to functions; 2nd return register
|
||||
|
||||
rsp* stack pointer
|
||||
|
||||
rbp* callee-saved register; optionally used as frame pointer
|
||||
|
||||
rsi used to pass 2nd argument to functions
|
||||
|
||||
rdi used to pass 1st argument to functions
|
||||
|
||||
r8 used to pass 5th argument to functions
|
||||
|
||||
r9 used to pass 6th argument to functions
|
||||
|
||||
r10 temporary register, used for passing a function's static chain pointer
|
||||
|
||||
r11 temporary register
|
||||
|
||||
r12-15* callee-saved registers
|
||||
|
||||
xmm0-1 used to pass and return floating point arguments
|
||||
|
||||
xmm2-7 used to pass floating point arguments
|
||||
|
||||
xmm8-15 temporary registers
|
||||
|
||||
mmx0-7 temporary registers
|
||||
|
||||
st0 temporary register; used to return long double arguments
|
||||
|
||||
st1 temporary registers; used to return long double arguments
|
||||
|
||||
st2-7 temporary registers
|
||||
|
||||
fs Reserved for system use (as thread specific data register)
|
||||
|
||||
|
||||
|
||||
*) must be preserved across function calls
|
||||
|
||||
Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack
|
||||
Floating point arguments from list: xmm0-xmm7
|
||||
164
src/mesa/x86-64/matypes.h
Normal file
164
src/mesa/x86-64/matypes.h
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
/*
|
||||
* This file is automatically generated from the Mesa internal type
|
||||
* definitions. Do not edit directly.
|
||||
*/
|
||||
|
||||
#ifndef __ASM_TYPES_H__
|
||||
#define __ASM_TYPES_H__
|
||||
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for GLcontext
|
||||
*/
|
||||
|
||||
#define CTX_DRIVER_CTX 904
|
||||
|
||||
#define CTX_LIGHT_ENABLED 38592
|
||||
#define CTX_LIGHT_SHADE_MODEL 38596
|
||||
#define CTX_LIGHT_COLOR_MAT_FACE 38600
|
||||
#define CTX_LIGHT_COLOR_MAT_MODE 38604
|
||||
#define CTX_LIGHT_COLOR_MAT_MASK 38608
|
||||
#define CTX_LIGHT_COLOR_MAT_ENABLED 38612
|
||||
#define CTX_LIGHT_ENABLED_LIST 38616
|
||||
#define CTX_LIGHT_NEED_VERTS 42973
|
||||
#define CTX_LIGHT_FLAGS 42976
|
||||
#define CTX_LIGHT_BASE_COLOR 42980
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for struct vertex_buffer
|
||||
*/
|
||||
|
||||
#define VB_SIZE 0
|
||||
#define VB_COUNT 4
|
||||
|
||||
#define VB_ELTS 8
|
||||
#define VB_OBJ_PTR 12
|
||||
#define VB_EYE_PTR 16
|
||||
#define VB_CLIP_PTR 20
|
||||
#define VB_PROJ_CLIP_PTR 24
|
||||
#define VB_CLIP_OR_MASK 28
|
||||
#define VB_CLIP_MASK 32
|
||||
#define VB_NORMAL_PTR 36
|
||||
#define VB_EDGE_FLAG 44
|
||||
#define VB_TEX0_COORD_PTR 48
|
||||
#define VB_TEX1_COORD_PTR 52
|
||||
#define VB_TEX2_COORD_PTR 56
|
||||
#define VB_TEX3_COORD_PTR 60
|
||||
#define VB_INDEX_PTR 80
|
||||
#define VB_COLOR_PTR 88
|
||||
#define VB_SECONDARY_COLOR_PTR 96
|
||||
#define VB_FOG_COORD_PTR 108
|
||||
#define VB_POINT_SIZE_PTR 104
|
||||
#define VB_PRIMITIVE 112
|
||||
|
||||
#define VB_LAST_CLIPPED 244
|
||||
|
||||
/*
|
||||
* Flags for struct vertex_buffer
|
||||
*/
|
||||
|
||||
#define VERT_BIT_OBJ 0x1
|
||||
#define VERT_BIT_NORM 0x4
|
||||
#define VERT_BIT_RGBA 0x8
|
||||
#define VERT_BIT_SPEC_RGB 0x10
|
||||
#define VERT_BIT_FOG_COORD 0x20
|
||||
#define VERT_BIT_TEX0 0x100
|
||||
#define VERT_BIT_TEX1 0x200
|
||||
#define VERT_BIT_TEX2 0x400
|
||||
#define VERT_BIT_TEX3 0x800
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for GLvector4f
|
||||
*/
|
||||
|
||||
#define V4F_DATA 0
|
||||
#define V4F_START 4
|
||||
#define V4F_COUNT 8
|
||||
#define V4F_STRIDE 12
|
||||
#define V4F_SIZE 16
|
||||
#define V4F_FLAGS 20
|
||||
|
||||
/*
|
||||
* Flags for GLvector4f
|
||||
*/
|
||||
|
||||
#define VEC_MALLOC 0x10
|
||||
#define VEC_NOT_WRITEABLE 0x40
|
||||
#define VEC_BAD_STRIDE 0x100
|
||||
|
||||
#define VEC_SIZE_1 0x1
|
||||
#define VEC_SIZE_2 0x3
|
||||
#define VEC_SIZE_3 0x7
|
||||
#define VEC_SIZE_4 0xf
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for GLmatrix
|
||||
*/
|
||||
|
||||
#define MATRIX_DATA 0
|
||||
#define MATRIX_INV 4
|
||||
#define MATRIX_FLAGS 8
|
||||
#define MATRIX_TYPE 12
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for struct gl_light
|
||||
*/
|
||||
|
||||
#define LIGHT_NEXT 0
|
||||
#define LIGHT_PREV 4
|
||||
|
||||
#define LIGHT_AMBIENT 8
|
||||
#define LIGHT_DIFFUSE 24
|
||||
#define LIGHT_SPECULAR 40
|
||||
#define LIGHT_EYE_POSITION 56
|
||||
#define LIGHT_EYE_DIRECTION 72
|
||||
#define LIGHT_SPOT_EXPONENT 88
|
||||
#define LIGHT_SPOT_CUTOFF 92
|
||||
#define LIGHT_COS_CUTOFF 96
|
||||
#define LIGHT_CONST_ATTEN 100
|
||||
#define LIGHT_LINEAR_ATTEN 104
|
||||
#define LIGHT_QUADRATIC_ATTEN 108
|
||||
#define LIGHT_ENABLED 112
|
||||
|
||||
#define LIGHT_FLAGS 116
|
||||
|
||||
#define LIGHT_POSITION 120
|
||||
#define LIGHT_VP_INF_NORM 136
|
||||
#define LIGHT_H_INF_NORM 148
|
||||
#define LIGHT_NORM_DIRECTION 160
|
||||
#define LIGHT_VP_INF_SPOT_ATTEN 176
|
||||
|
||||
#define LIGHT_SPOT_EXP_TABLE 180
|
||||
#define LIGHT_MAT_AMBIENT 4276
|
||||
#define LIGHT_MAT_DIFFUSE 4300
|
||||
#define LIGHT_MAT_SPECULAR 4324
|
||||
|
||||
#define SIZEOF_GL_LIGHT 4356
|
||||
|
||||
/*
|
||||
* Flags for struct gl_light
|
||||
*/
|
||||
|
||||
#define LIGHT_SPOT 0x1
|
||||
#define LIGHT_LOCAL_VIEWER 0x2
|
||||
#define LIGHT_POSITIONAL 0x4
|
||||
|
||||
#define LIGHT_NEED_VERTICES 0x6
|
||||
|
||||
|
||||
/* =============================================================
|
||||
* Offsets for struct gl_lightmodel
|
||||
*/
|
||||
|
||||
#define LIGHT_MODEL_AMBIENT 0
|
||||
#define LIGHT_MODEL_LOCAL_VIEWER 16
|
||||
#define LIGHT_MODEL_TWO_SIDE 17
|
||||
#define LIGHT_MODEL_COLOR_CONTROL 20
|
||||
|
||||
|
||||
#endif /* __ASM_TYPES_H__ */
|
||||
115
src/mesa/x86-64/x86-64.c
Normal file
115
src/mesa/x86-64/x86-64.c
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
/* $Id: x86-64.c,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
|
||||
|
||||
/*
|
||||
* Mesa 3-D graphics library
|
||||
* Version: 6.3
|
||||
*
|
||||
* Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by
|
||||
* Mikko Tiihonen
|
||||
*/
|
||||
|
||||
#ifdef USE_X86_64_ASM
|
||||
|
||||
#include "glheader.h"
|
||||
#include "context.h"
|
||||
#include "math/m_xform.h"
|
||||
#include "tnl/t_context.h"
|
||||
#include "x86-64.h"
|
||||
#include "../x86/common_x86_macros.h"
|
||||
|
||||
#ifdef DEBUG
|
||||
#include "math/m_debug.h"
|
||||
#endif
|
||||
|
||||
DECLARE_XFORM_GROUP( x86_64, 4 )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS );
|
||||
extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS );
|
||||
*/
|
||||
|
||||
#ifdef USE_X86_64_ASM
|
||||
/*
 * Print msg to stderr when debug output is wanted.
 *
 * In builds with DEBUG defined the message is always printed; otherwise it
 * is printed only when the MESA_DEBUG environment variable is set (its value
 * is not examined, only whether _mesa_getenv() returns non-NULL).
 */
static void message( const char *msg )
|
||||
{
|
||||
GLboolean debug; /* GL_TRUE if the message should be emitted */
|
||||
#ifdef DEBUG
|
||||
debug = GL_TRUE;
|
||||
#else
|
||||
if ( _mesa_getenv( "MESA_DEBUG" ) ) {
|
||||
debug = GL_TRUE;
|
||||
} else {
|
||||
debug = GL_FALSE;
|
||||
}
|
||||
#endif
|
||||
if ( debug ) {
|
||||
/* "%s" keeps msg from being interpreted as a format string */
fprintf( stderr, "%s", msg );
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Hook the x86-64 assembly transform routines into Mesa.
 *
 * The body is compiled only when USE_X86_64_ASM is defined; otherwise this
 * is an empty function. At run time, setting the MESA_NO_ASM environment
 * variable skips the hook-up entirely.
 */
void _mesa_init_all_x86_64_transform_asm(void)
|
||||
{
|
||||
#ifdef USE_X86_64_ASM
|
||||
|
||||
/* user asked for the C paths only: leave everything untouched */
if ( _mesa_getenv( "MESA_NO_ASM" ) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
message("Initializing x86-64 optimizations\n");
|
||||
|
||||
/* NOTE(review): ASSIGN_XFORM_GROUP comes from ../x86/common_x86_macros.h;
 * presumably it installs the seven points4 routines, as spelled out in the
 * disabled code below -- confirm against the macro definition. */
ASSIGN_XFORM_GROUP( x86_64, 4 );
|
||||
|
||||
/*
|
||||
_mesa_transform_tab[4][MATRIX_GENERAL] =
|
||||
_mesa_x86_64_transform_points4_general;
|
||||
_mesa_transform_tab[4][MATRIX_IDENTITY] =
|
||||
_mesa_x86_64_transform_points4_identity;
|
||||
_mesa_transform_tab[4][MATRIX_3D] =
|
||||
_mesa_x86_64_transform_points4_3d;
|
||||
_mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
|
||||
_mesa_x86_64_transform_points4_3d_no_rot;
|
||||
_mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
|
||||
_mesa_x86_64_transform_points4_perspective;
|
||||
_mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
|
||||
_mesa_x86_64_transform_points4_2d_no_rot;
|
||||
_mesa_transform_tab[4][MATRIX_2D] =
|
||||
_mesa_x86_64_transform_points4_2d;
|
||||
*/
|
||||
|
||||
/* debug builds run the math self-tests after the routines are hooked in */
#ifdef DEBUG
|
||||
_math_test_all_transform_functions("x86_64");
|
||||
_math_test_all_cliptest_functions("x86_64");
|
||||
_math_test_all_normal_transform_functions("x86_64");
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
32
src/mesa/x86-64/x86-64.h
Normal file
32
src/mesa/x86-64/x86-64.h
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/* $Id: x86-64.h,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
|
||||
|
||||
/*
|
||||
* Mesa 3-D graphics library
|
||||
* Version: 3.5
|
||||
*
|
||||
* Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __X86_64_ASM_H__
|
||||
#define __X86_64_ASM_H__
|
||||
|
||||
extern void _mesa_init_all_x86_64_transform_asm( void );
|
||||
|
||||
#endif
|
||||
458
src/mesa/x86-64/xform4.S
Normal file
458
src/mesa/x86-64/xform4.S
Normal file
|
|
@ -0,0 +1,458 @@
|
|||
/* $Id: xform4.S,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
|
||||
|
||||
/*
|
||||
* Mesa 3-D graphics library
|
||||
* Version: 3.5
|
||||
*
|
||||
* Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifdef USE_X86_64_ASM
|
||||
|
||||
#include "matypes.h"
|
||||
|
||||
.text
|
||||
|
||||
.align 16
|
||||
|
||||
.globl _mesa_x86_64_transform_points4_general
|
||||
_mesa_x86_64_transform_points4_general:
|
||||
/*
|
||||
* rdi = dest
|
||||
* rsi = matrix
|
||||
* rdx = source
|
||||
*/
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
.byte 0x66, 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
testl %ecx, %ecx /* verify non-zero count */
|
||||
prefetchnta 64(%rsi)
|
||||
jz p4_general_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
prefetch 16(%rdx)
|
||||
|
||||
movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
|
||||
movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
|
||||
movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
|
||||
|
||||
/*
 * Main loop: transforms one 4-component vertex per iteration.
 * On entry %xmm4-%xmm7 hold the four 16-byte matrix vectors loaded above,
 * %rdx / %rdi point at the current source / dest vertex, %rax is the source
 * stride and %ecx the number of vertices left (non-zero).
 * Lane comments are written high lane first, low lane last.
 */
p4_general_loop:
|
||||
|
||||
movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
|
||||
prefetchw 16(%rdi) /* prefetch the next dest vertex for writing */
|
||||
|
||||
pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
|
||||
addq %rax, %rdx /* advance src pointer by stride */
|
||||
pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
|
||||
mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
|
||||
pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
|
||||
mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
|
||||
pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
|
||||
mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
|
||||
addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
|
||||
mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
|
||||
addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
|
||||
prefetch 16(%rdx) /* %rdx already points at the next src vertex */
|
||||
addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
|
||||
|
||||
movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
|
||||
addq $16, %rdi /* advance dest pointer by one vertex (16 bytes) */
|
||||
|
||||
decl %ecx /* one vertex done */
|
||||
jnz p4_general_loop /* repeat while vertices remain */
|
||||
|
||||
p4_general_done:
|
||||
.byte 0xf3
|
||||
ret
|
||||
|
||||
.section .rodata
|
||||
|
||||
.align 16
|
||||
p4_constants:
|
||||
.byte 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00
|
||||
.float 0f+1.0
|
||||
|
||||
.text
|
||||
.align 16
|
||||
.globl _mesa_x86_64_transform_points4_3d
|
||||
/*
|
||||
* this is slower than _mesa_x86_64_transform_points4_general
|
||||
* because it forces m3, m7 and m11 to 0.0 and m15 to 1.0, i.e. the last row of the matrix (as applied to column vectors) becomes 0,0,0,1
|
||||
*/
|
||||
_mesa_x86_64_transform_points4_3d:
|
||||
|
||||
leaq p4_constants(%rip), %rax
|
||||
|
||||
prefetchnta 64(%rsi)
|
||||
|
||||
movaps (%rax), %xmm9
|
||||
movaps 16(%rax), %xmm10
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
testl %ecx, %ecx /* verify non-zero count */
|
||||
jz p4_3d_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
prefetch 16(%rdx)
|
||||
|
||||
movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
|
||||
movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
|
||||
andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
|
||||
movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
|
||||
andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
|
||||
movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
|
||||
andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
|
||||
andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
|
||||
|
||||
/*
 * Main loop: transforms one 4-component vertex per iteration.
 * On entry %xmm4-%xmm7 hold the four matrix vectors as masked above
 * (lane 3 of %xmm4-%xmm6 is 0.0, lane 3 of %xmm7 is 1.0), %rdx / %rdi point
 * at the current source / dest vertex, %rax is the source stride and %ecx
 * the number of vertices left (non-zero).
 * Lane comments are written high lane first, low lane last.
 */
p4_3d_loop:
|
||||
|
||||
movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
|
||||
prefetchw 16(%rdi) /* prefetch the next dest vertex for writing */
|
||||
|
||||
pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
|
||||
addq %rax, %rdx /* advance src pointer by stride */
|
||||
pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
|
||||
mulps %xmm4, %xmm0 /* ox*0.0 | ox*m2 | ox*m1 | ox*m0 */
|
||||
pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
|
||||
mulps %xmm5, %xmm1 /* oy*0.0 | oy*m6 | oy*m5 | oy*m4 */
|
||||
pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
|
||||
mulps %xmm6, %xmm2 /* oz*0.0 | oz*m10 | oz*m9 | oz*m8 */
|
||||
addps %xmm1, %xmm0 /* ox*0.0+oy*0.0 | ox*m2+oy*m6 | ... */
|
||||
mulps %xmm7, %xmm3 /* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
|
||||
addps %xmm2, %xmm0 /* (zero terms) | ox*m2+oy*m6+oz*m10 | ... */
|
||||
prefetch 16(%rdx) /* %rdx already points at the next src vertex */
|
||||
addps %xmm3, %xmm0 /* (zero terms)+ow | ox*m2+oy*m6+oz*m10+ow*m14 | ... */
|
||||
|
||||
movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
|
||||
addq $16, %rdi /* advance dest pointer by one vertex (16 bytes) */
|
||||
|
||||
dec %ecx /* one vertex done */
|
||||
jnz p4_3d_loop /* repeat while vertices remain */
|
||||
|
||||
p4_3d_done:
|
||||
.byte 0xf3
|
||||
ret
|
||||
|
||||
|
||||
.align 16
|
||||
.globl _mesa_x86_64_transform_points4_identity
|
||||
_mesa_x86_64_transform_points4_identity:
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
test %ecx, %ecx
|
||||
jz p4_identity_done
|
||||
|
||||
movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
prefetch 64(%rsi)
|
||||
prefetchw 64(%rdi)
|
||||
|
||||
add %ecx, %ecx
|
||||
|
||||
rep movsq
|
||||
|
||||
p4_identity_done:
|
||||
.byte 0xf3
|
||||
ret
|
||||
|
||||
|
||||
.align 16
|
||||
/*
 * Transform 4-component points by a matrix of which only the elements at
 * byte offsets 0, 20, 40, 48, 52 and 56 (m00, m11, m22, m30, m31, m32 in the
 * comments below) are read. Uses MMX registers with 3DNow! arithmetic
 * (pfmul/pfacc/pfadd) and leaves MMX state via femms before returning.
 *
 * Registers, as used by the code (same assignment as documented on
 * _mesa_x86_64_transform_points4_general):
 *   rdi = dest vector, rsi = matrix data, rdx = source vector
 *
 * NOTE(review): V4F_START is read with an 8-byte movq, while matypes.h as
 * checked in places it at offset 4, right after V4F_DATA at 0. That spacing
 * looks like a 32-bit layout -- confirm matypes.h is regenerated for x86-64.
 */
.globl _mesa_x86_64_transform_points4_3d_no_rot
|
||||
_mesa_x86_64_transform_points4_3d_no_rot:
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
test %ecx, %ecx /* nothing to do for a zero count */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
jz p4_3d_no_rot_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
prefetch (%rdx)
|
||||
|
||||
movd (%rsi), %mm0 /* | m00 */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
punpckldq 20(%rsi), %mm0 /* m11 | m00 */
|
||||
|
||||
movd 40(%rsi), %mm2 /* | m22 */
|
||||
movq 48(%rsi), %mm1 /* m31 | m30 */
|
||||
|
||||
punpckldq 56(%rsi), %mm2 /* m32 | m22 */
|
||||
|
||||
/* one vertex per iteration; %ecx counts the vertices left */
p4_3d_no_rot_loop:
|
||||
|
||||
prefetchw 32(%rdi) /* prefetch dest two vertices ahead for writing */
|
||||
|
||||
movq (%rdx), %mm4 /* x1 | x0 */
|
||||
movq 8(%rdx), %mm5 /* x3 | x2 */
|
||||
movd 12(%rdx), %mm7 /* | x3 */
|
||||
|
||||
movq %mm5, %mm6 /* x3 | x2 */
|
||||
pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
|
||||
|
||||
punpckhdq %mm6, %mm6 /* x3 | x3 */
|
||||
pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
|
||||
|
||||
pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
|
||||
pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
|
||||
|
||||
pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
|
||||
|
||||
addq %rax, %rdx /* advance src pointer by stride */
|
||||
movq %mm4, (%rdi) /* write r0, r1 */
|
||||
movq %mm5, 8(%rdi) /* write r2, r3 */
|
||||
|
||||
addq $16, %rdi /* advance dest pointer by one vertex (16 bytes) */
|
||||
|
||||
decl %ecx /* sets ZF for the jnz below; prefetch does not touch flags */
|
||||
prefetch 32(%rdx)
|
||||
jnz p4_3d_no_rot_loop
|
||||
|
||||
p4_3d_no_rot_done:
|
||||
femms /* leave MMX/3DNow! state before returning */
|
||||
ret
|
||||
|
||||
|
||||
.align 16
|
||||
.globl _mesa_x86_64_transform_points4_perspective
|
||||
_mesa_x86_64_transform_points4_perspective:
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
test %ecx, %ecx
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
jz p4_perspective_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
movd (%rsi), %mm0 /* | m00 */
|
||||
pxor %mm7, %mm7 /* 0 | 0 */
|
||||
punpckldq 20(%rsi), %mm0 /* m11 | m00 */
|
||||
|
||||
movq 32(%rsi), %mm2 /* m21 | m20 */
|
||||
prefetch (%rdx)
|
||||
|
||||
movd 40(%rsi), %mm1 /* | m22 */
|
||||
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
|
||||
punpckldq 56(%rsi), %mm1 /* m32 | m22 */
|
||||
|
||||
|
||||
p4_perspective_loop:
|
||||
|
||||
prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
|
||||
|
||||
movq (%rdx), %mm4 /* x1 | x0 */
|
||||
movq 8(%rdx), %mm5 /* x3 | x2 */
|
||||
movd 8(%rdx), %mm3 /* | x2 */
|
||||
|
||||
movq %mm5, %mm6 /* x3 | x2 */
|
||||
pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
|
||||
|
||||
punpckldq %mm5, %mm5 /* x2 | x2 */
|
||||
|
||||
pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
|
||||
pfsubr %mm7, %mm3 /* | -x2 */
|
||||
|
||||
pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
|
||||
pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
|
||||
|
||||
pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
|
||||
|
||||
movq %mm5, (%rdi) /* write r0, r1 */
|
||||
addq %rax, %rdx
|
||||
movq %mm6, 8(%rdi) /* write r2, r3 */
|
||||
|
||||
addq $16, %rdi
|
||||
|
||||
decl %ecx
|
||||
prefetch 32(%rdx) /* hopefully stride is zero */
|
||||
jnz p4_perspective_loop
|
||||
|
||||
p4_perspective_done:
|
||||
femms
|
||||
ret
|
||||
|
||||
.align 16
|
||||
.globl _mesa_x86_64_transform_points4_2d_no_rot
|
||||
_mesa_x86_64_transform_points4_2d_no_rot:
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
test %ecx, %ecx
|
||||
.byte 0x90 /* manual align += 1 */
|
||||
jz p4_2d_no_rot_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
movd (%rsi), %mm0 /* | m00 */
|
||||
prefetch (%rdx)
|
||||
punpckldq 20(%rsi), %mm0 /* m11 | m00 */
|
||||
|
||||
movq 48(%rsi), %mm1 /* m31 | m30 */
|
||||
|
||||
p4_2d_no_rot_loop:
|
||||
|
||||
prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
|
||||
|
||||
movq (%rdx), %mm4 /* x1 | x0 */
|
||||
movq 8(%rdx), %mm5 /* x3 | x2 */
|
||||
|
||||
pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
|
||||
movq %mm5, %mm6 /* x3 | x2 */
|
||||
|
||||
punpckhdq %mm6, %mm6 /* x3 | x3 */
|
||||
|
||||
addq %rax, %rdx
|
||||
pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
|
||||
|
||||
prefetch 32(%rdx) /* hopefully stride is zero */
|
||||
pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
|
||||
|
||||
movq %mm6, (%rdi) /* write r0, r1 */
|
||||
movq %mm5, 8(%rdi) /* write r2, r3 */
|
||||
|
||||
addq $16, %rdi
|
||||
|
||||
decl %ecx
|
||||
jnz p4_2d_no_rot_loop
|
||||
|
||||
p4_2d_no_rot_done:
|
||||
femms
|
||||
ret
|
||||
|
||||
|
||||
.align 16
|
||||
.globl _mesa_x86_64_transform_points4_2d
|
||||
_mesa_x86_64_transform_points4_2d:
|
||||
|
||||
movl V4F_COUNT(%rdx), %ecx /* count */
|
||||
movzx V4F_STRIDE(%rdx), %eax /* stride */
|
||||
|
||||
movl %ecx, V4F_COUNT(%rdi) /* set dest count */
|
||||
movl $4, V4F_SIZE(%rdi) /* set dest size */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 4 */
|
||||
orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
|
||||
|
||||
test %ecx, %ecx
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 4 */
|
||||
jz p4_2d_done
|
||||
|
||||
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
|
||||
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
|
||||
|
||||
movd (%rsi), %mm0 /* | m00 */
|
||||
movd 4(%rsi), %mm1 /* | m01 */
|
||||
|
||||
prefetch (%rdx)
|
||||
|
||||
punpckldq 16(%rsi), %mm0 /* m10 | m00 */
|
||||
.byte 0x66, 0x66, 0x90 /* manual align += 4 */
|
||||
punpckldq 20(%rsi), %mm1 /* m11 | m01 */
|
||||
|
||||
movq 48(%rsi), %mm2 /* m31 | m30 */
|
||||
|
||||
p4_2d_loop:
|
||||
|
||||
prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
|
||||
|
||||
movq (%rdx), %mm3 /* x1 | x0 */
|
||||
movq 8(%rdx), %mm5 /* x3 | x2 */
|
||||
|
||||
movq %mm3, %mm4 /* x1 | x0 */
|
||||
movq %mm5, %mm6 /* x3 | x2 */
|
||||
|
||||
pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
|
||||
punpckhdq %mm6, %mm6 /* x3 | x3 */
|
||||
|
||||
pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
|
||||
|
||||
addq %rax, %rdx
|
||||
pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
|
||||
|
||||
pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
|
||||
prefetch 32(%rdx) /* hopefully stride is zero */
|
||||
|
||||
pfadd %mm6, %mm3 /* r1 | r0 */
|
||||
|
||||
movq %mm3, (%rdi) /* write r0, r1 */
|
||||
movq %mm5, 8(%rdi) /* write r2, r3 */
|
||||
|
||||
addq $16, %rdi
|
||||
|
||||
decl %ecx
|
||||
jnz p4_2d_loop
|
||||
|
||||
p4_2d_done:
|
||||
femms
|
||||
ret
|
||||
|
||||
#endif
|
||||
|
|
@ -1730,11 +1730,17 @@ SECTION _DATA public align=16 class=DATA use32 flat
|
|||
#define TLBL(a) CONCAT(a,$)
|
||||
#endif
|
||||
|
||||
/* hidden symbol visibility support */
|
||||
/* Hidden symbol visibility support.
|
||||
* If we build with gcc's -fvisibility=hidden flag, we'll need to change
|
||||
* the symbol visibility mode to 'default'.
|
||||
*/
|
||||
#if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__)
|
||||
#define HIDDEN(a) .hidden a
|
||||
# define HIDDEN(x) .hidden x
|
||||
#elif defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
|
||||
# pragma GCC visibility push(default)
|
||||
# define HIDDEN(x) .hidden x
|
||||
#else
|
||||
#define HIDDEN(a)
|
||||
# define HIDDEN(x)
|
||||
#endif
|
||||
|
||||
#endif /* __ASSYNTAX_H__ */
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ do { \
|
|||
printf( "\n" ); \
|
||||
} while (0)
|
||||
|
||||
#if defined(__BEOS__)
|
||||
#if defined(__BEOS__) || defined(_LP64)
|
||||
#define OFFSET( s, t, m ) \
|
||||
printf( "#define %s\t%ld\n", s, offsetof( t, m ) );
|
||||
#else
|
||||
|
|
@ -69,7 +69,7 @@ do { \
|
|||
printf( "#define %s\t%d\n", s, offsetof( t, m ) );
|
||||
#endif
|
||||
|
||||
#if defined(__BEOS__)
|
||||
#if defined(__BEOS__) || defined(_LP64)
|
||||
#define SIZEOF( s, t ) \
|
||||
printf( "#define %s\t%ld\n", s, sizeof(t) );
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -29,16 +29,6 @@
|
|||
#include "assyntax.h"
|
||||
#include "glapioffsets.h"
|
||||
|
||||
/* If we build with gcc's -fvisibility=hidden flag, we'll need to change
|
||||
* the symbol visibility mode to 'default'.
|
||||
*/
|
||||
#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
|
||||
# pragma GCC visibility push(default)
|
||||
# define HIDDEN(x) .hidden x
|
||||
#else
|
||||
# define HIDDEN(x)
|
||||
#endif
|
||||
|
||||
#ifndef __WIN32__
|
||||
|
||||
#if defined(STDCALL_API)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue