Definition of several utility macros for self-contained MMX operations such as scaling and lerping.

Restructured the MMX blending function to use a template, so that only the main loop needs to be specified; the same loop is also used to generate the run-in and run-out sections.
Optimized the MMX function after remembering that the multiplication is commutative (how can somebody forget this..), resulting in less register usage. There is now no need to generate any constant, or read one from memory, inside the loop.

Assemblers other than the GNU assembler can choke on the output of the C preprocessor since it was necessary to add line separators ';' to the defined macros.
This commit is contained in:
Jose Fonseca 2002-04-18 11:57:28 +00:00
parent 9ff3e9d992
commit 55d9ee83b4
2 changed files with 340 additions and 306 deletions

View file

@ -4,8 +4,10 @@
#include "matypes.h"
/*
* make the following approximation to the division (Sree)
/* integer multiplication - alpha plus one
*
* makes the following approximation to the division (Sree)
*
* rgb*a/255 ~= (rgb*(a+1)) >> 8
*
@ -13,12 +15,24 @@
*
* 0*0 = 0 and 255*255 = 255
*
* note this one should be used alone
* note that MX1 is a register holding the 0xffffffffffffffff constant, which can easily be obtained by doing
*
* PCMPEQW ( MX1, MX1 )
*/
/*
 * GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 )
 *
 * scales the pixel words in MP1 (and MP2 when TWO() is active) by the alpha
 * words in MA1/MA2 using the (a + 1) approximation above; the results are
 * left in MA1/MA2, while MP1/MP2 and MX1 are read only
 *
 * MX1 must already hold 0xffffffffffffffff (-1 per word), so subtracting it
 * increments every alpha word by one
 */
#define GMBT_ALPHA_PLUS_ONE 0
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
PSUBW ( MX1, MA1 ) /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
TWO(PSUBW ( MX1, MA2 )) /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
;\
PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= t1/255 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= t2/255 */
/*
* take the geometric series approximation to the division
/* integer multiplication - geometric series
*
* takes the geometric series approximation to the division
*
* t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
*
@ -29,333 +43,240 @@
* note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
* so the special case a = 255 must be accounted or roundoff must be used
*/
/*
 * GMB_MULT_GS( MP1, MA1, MP2, MA2 )
 *
 * scales the pixel words in MP1/MP2 by the alpha words in MA1/MA2 using the
 * first term of the geometric series, i.e. t/255 ~= (t + (t >> 8)) >> 8;
 * the results are left in MA1/MA2 and MP1/MP2 are clobbered (used as scratch)
 */
#define GMBT_GEOMETRIC_SERIES 1
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
;\
MOVQ ( MA1, MP1 ) ;\
TWO(MOVQ ( MA2, MP2 )) ;\
;\
PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
/*
/* integer multiplication - geometric series plus rounding
*
* when using a geometric series division instead of truncating the result
* use roundoff in the approximation (Jim Blinn)
*
* t = rgb*a + 0x80
*
* achieving the exact results
*
* note that M80 is register with the 0x0080008000800080 constant
*/
/*
 * GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 )
 *
 * as GMB_MULT_GS but with Blinn's 0x80 roundoff added before the geometric
 * series, giving the exact p*a/255; the results are left in MA1/MA2, MP1/MP2
 * are clobbered, and M80 (holding 0x0080008000800080) is read only
 */
#define GMBT_ROUNDOFF 0
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
;\
PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
;\
MOVQ ( MA1, MP1 ) ;\
TWO(MOVQ ( MA2, MP2 )) ;\
;\
PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
/* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
/* linear interpolation - geometric series
*/
/*
 * GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2 )
 *
 * linear interpolation s = q + (p - q)*a/255 using the geometric series;
 * MA1/MA2 must hold the alpha replicated into all four words; the results
 * are left in MA1/MA2 and both MP1/MP2 and MQ1/MQ2 are clobbered
 * (q is pre-shifted left by 8 so it can be added before the final shift)
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
;\
PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
;\
PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2 */ ;\
;\
MOVQ ( MA1, MP1 ) ;\
TWO(MOVQ ( MA2, MP2 )) ;\
;\
PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
;\
PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
/* linear interpolation - geometric series with roundoff
*
* this is a generalization of Blinn's formula to signed arithmetic
*
* note that M80 is a register with the 0x0080008000800080 constant
*/
/*
 * GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80 )
 *
 * as GMB_LERP_GS but generalizing Blinn's roundoff to the signed product:
 * the sign bit of (p - q) selects an extra -0x100 bias before the 0x80
 * roundoff; the results are left in MA1/MA2, MP1/MP2 and MQ1/MQ2 are
 * clobbered, and M80 (holding 0x0080008000800080) is read only
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
;\
PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
;\
PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2 */ ;\
;\
PSRLW ( CONST(15), MP1 ) /* q1 > p1 ? 1 : 0 */ ;\
TWO(PSRLW ( CONST(15), MP2 )) /* q2 > p2 ? 1 : 0 */ ;\
;\
PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\
TWO(PSLLW ( CONST(8), MP2 )) /* q2 > p2 ? 0x100 : 0 */ ;\
;\
PSUBW ( MP1, MA1 ) /* t1 -=? 0x100 */ ;\
TWO(PSUBW ( MP2, MA2 )) /* t2 -=? 0x100 */ ;\
;\
PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
;\
MOVQ ( MA1, MP1 ) ;\
TWO(MOVQ ( MA2, MP2 )) ;\
;\
PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
;\
PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
/* linear interpolation - geometric series with correction
*
* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
*
* t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
*
* note that although it is faster than rounding off, it doesn't always give the exact results
*/
/*
 * GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2 )
 *
 * as GMB_LERP_GS but with the extra (t >> 15) correction term described
 * above; the results are left in MA1/MA2 and MP1/MP2, MQ1/MQ2 are clobbered
 * (note the second shift is by 7 because MP1/MP2 already hold t >> 8)
 */
#define GMBT_GEOMETRIC_CORRECTION 1
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
;\
PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
;\
PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1 */ ;\
TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2 */ ;\
;\
MOVQ ( MA1, MP1 ) ;\
TWO(MOVQ ( MA2, MP2 )) ;\
;\
PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
;\
PSRLW ( CONST(7), MP1 ) /* t1 >> 15 */ ;\
TWO(PSRLW ( CONST(7), MP2 )) /* t2 >> 15 */ ;\
;\
PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
;\
PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
;\
PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
/*
 * NOTE(review): the paired `#if GMBT_ROUNDOFF` / `#if 0` guards below and the
 * missing `;\` continuation after the PXOR in the two-argument GMB_INIT look
 * like artifacts of the diff view this text was captured from -- verify
 * against the repository copy of the file before relying on this section.
 */
#if GMBT_ROUNDOFF
/* common blending initialization code
 */
#if 0 /* rounding not used */
SEG_DATA
ALIGNDATA8
const_80:
D_LONG 0x00800080, 0x00800080
#endif
#define GMB_INIT( M00, M80 ) \
PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
MOVQ ( CONTENT(const_80), M80 ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */
#else
#define GMB_INIT( M00 ) \
PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
#endif
/* common blending loading code
 *
 * loads and unpacks one (ONE) or two (TWO) pixels into words: MP1/MP2
 * receive the source (rgba = p) pixels, MQ1/MQ2 the destination (dest = q)
 * pixels, and MA1/MA2 the source alpha replicated into all four words
 *
 * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making
 *
 * PXOR ( M00, M00 )
 */
#define GMB_LOAD(rgba, dest, MP1, MQ1, MA1, MP2, MQ2, MA2, M00) \
ONE(MOVD ( REGIND(rgba), MP1 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(MOVD ( REGIND(dest), MQ1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
;\
TWO(MOVQ ( REGIND(rgba), MP1 )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
TWO(MOVQ ( REGIND(dest), MQ1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
;\
TWO(MOVQ ( MP1, MP2 )) ;\
TWO(MOVQ ( MQ1, MQ2 )) ;\
;\
PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\
TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\
PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\
TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */ ;\
;\
MOVQ ( MP1, MA1 ) ;\
TWO(MOVQ ( MP2, MA2 )) ;\
;\
PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\
TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\
PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\
TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */
/* common blending storing code
 *
 * packs the result words in MA1/MA2 back to unsigned bytes (with saturation)
 * and stores one (ONE) or two (TWO) pixels over rgba; MA1 is clobbered
 */
#define GMB_STORE(rgba, MA1, MA2) \
PACKUSWB ( MA2, MA1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
;\
ONE(MOVD ( MA1, REGIND(rgba) )) ;\
TWO(MOVQ ( MA1, REGIND(rgba) ))
SEG_TEXT
ALIGNTEXT16
GLOBL GLNAME(_mesa_mmx_blend_transparency)
/*
* void blend_transparency( GLcontext *ctx,
* GLuint n,
* const GLubyte mask[],
* GLchan rgba[][4],
* CONST GLchan dest[][4] )
*
* Common transparency blending mode.
/* common transparency blending mode
*/
GLNAME( _mesa_mmx_blend_transparency ):
PUSH_L ( EBP )
MOV_L ( ESP, EBP )
PUSH_L ( ESI )
PUSH_L ( EDI )
PUSH_L ( EBX )
/*
 * NOTE(review): the lines below interleave the new template instantiation
 * (#define TAG/INIT/MAIN + #include "mmx_blendtmp.h") with lines of the
 * pre-template function body as rendered by the diff viewer; only the
 * #define and #include lines belong to the post-commit file.
 */
#define TAG(x) x##_transparency
MOV_L ( REGOFF(12, EBP), ECX ) /* n */
CMP_L ( CONST(0), ECX)
JE ( LLBL (GMBT_return) )
#define INIT \
GMB_INIT( MM0 )
MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
JZ ( LLBL (GMBT_align_end) )
#define MAIN \
GMB_LOAD( EDI, ESI, MM1, MM2, MM3, MM4, MM5, MM6, MM0) ;\
GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\
GMB_STORE( EDI, MM3, MM6 )
CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
JE ( LLBL (GMBT_align_continue) )
#include "mmx_blendtmp.h"
PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */
MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */
PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
MOVQ ( MM2, MM3 )
PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
#if GMBT_ALPHA_PLUS_ONE
PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
#endif
PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
#if GMBT_ROUNDOFF
MOVQ ( MM2, MM4 )
#endif
PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
#if GMBT_ROUNDOFF
PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */
PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */
PSUBW ( MM4, MM2 ) /* t1 -=? 0x100 */
#endif
#if GMBT_ROUNDOFF
MOVQ ( CONTENT(const_80), MM4 )
PADDW ( MM4, MM2 ) /* t1 += 0x80 */
#endif
#if GMBT_GEOMETRIC_SERIES
MOVQ ( MM2, MM3 )
PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
#if GMBT_GEOMETRIC_CORRECTION
PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
#endif
#endif
PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */
MOVD ( MM2, REGIND(EDI) )
LLBL (GMBT_align_continue):
DEC_L ( ECX ) /* n -= 1 */
INC_L ( EBX ) /* mask += 1 */
ADD_L ( CONST(4), EDI ) /* rgba += 1 */
ADD_L ( CONST(4), ESI ) /* dest += 1 */
LLBL (GMBT_align_end):
CMP_L ( CONST(2), ECX)
JB ( LLBL (GMBT_loop_end) )
ALIGNTEXT16
LLBL (GMBT_loop_begin):
CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
JE ( LLBL (GMBT_loop_continue) )
/* NOTE: the instruction pairing when multiple pipelines are available must be checked */
PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
MOVQ ( REGIND(ESI), MM7 ) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
MOVQ ( REGIND(EDI), MM6 ) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */
MOVQ ( MM7, MM1 )
MOVQ ( MM6, MM2 )
PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
PUNPCKHBW ( MM0, MM7 ) /* qa2 | qb2 | qg2 | qr2 */
PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
PUNPCKHBW ( MM0, MM6 ) /* pa2 | pb2 | pg2 | pr2 */
MOVQ ( MM2, MM3 )
MOVQ ( MM6, MM5 )
PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
PUNPCKHWD ( MM5, MM5 ) /* pa2 | pa2 | | */
PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
PUNPCKHDQ ( MM5, MM5 ) /* pa2 | pa2 | pa2 | pa2 */
#if GMBT_ALPHA_PLUS_ONE
PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
PSUBW ( MM4, MM5 ) /* pa2 + 1 | pa2 + 1 | pa2 + 1 | pa2 + 1 */
#endif
PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
PSUBW ( MM7, MM6 ) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */
PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
PSLLW ( CONST(8), MM7 ) /* q2 << 8 */
#if GMBT_ROUNDOFF
MOVQ ( MM2, MM0 )
MOVQ ( MM6, MM4 )
#endif
PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
PMULLW ( MM5, MM6 ) /* t2 = (q2 - p2)*pa2 */
#if GMBT_ROUNDOFF
PSRLW ( CONST(15), MM0 ) /* q1 > p1 ? 1 : 0 */
PSRLW ( CONST(15), MM4 ) /* q2 > q2 ? 1 : 0 */
PSLLW ( CONST(8), MM0 ) /* q1 > p1 ? 0x100 : 0 */
PSLLW ( CONST(8), MM4 ) /* q2 > q2 ? 0x100 : 0 */
PSUBW ( MM0, MM2 ) /* t1 -=? 0x100 */
PSUBW ( MM4, MM7 ) /* t2 -=? 0x100 */
#endif
#if GMBT_ROUNDOFF
MOVQ ( CONTENT(const_80), MM4 )
PADDW ( MM4, MM2 ) /* t1 += 0x80 */
PADDW ( MM4, MM6 ) /* t2 += 0x80 */
#endif
#if GMBT_GEOMETRIC_SERIES
MOVQ ( MM2, MM3 )
MOVQ ( MM6, MM5 )
PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
PSRLW ( CONST(8), MM5 ) /* t2 >> 8 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */
#if GMBT_GEOMETRIC_CORRECTION
PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
PSRLW ( CONST(7), MM5 ) /* t2 >> 15 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */
#endif
#endif
PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
PADDW ( MM7, MM6 ) /* (t2/255 + q2) << 8 */
PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
PSRLW ( CONST(8), MM6 ) /* sa2 | sb2 | sg2 | sr2 */
PACKUSWB ( MM6, MM2 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
MOVQ ( MM2, REGIND(EDI) )
LLBL (GMBT_loop_continue):
DEC_L ( ECX )
DEC_L ( ECX ) /* n -= 2 */
ADD_L ( CONST(2), EBX ) /* mask += 2 */
ADD_L ( CONST(8), EDI ) /* rgba += 2 */
ADD_L ( CONST(8), ESI ) /* dest += 2 */
CMP_L ( CONST(2), ECX )
JAE ( LLBL (GMBT_loop_begin) )
LLBL (GMBT_loop_end):
CMP_L ( CONST(1), ECX )
JB ( LLBL (GMBT_done) )
CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
JE ( LLBL (GMBT_done) )
PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */
MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */
PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
MOVQ ( MM2, MM3 )
PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
#if GMBT_ALPHA_PLUS_ONE
PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
#endif
PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
#if GMBT_ROUNDOFF
MOVQ ( MM2, MM4 )
#endif
PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
#if GMBT_ROUNDOFF
PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */
PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */
PSUBW ( MM4, MM2 ) /* t1 -=? 0x100 */
#endif
#if GMBT_ROUNDOFF
MOVQ ( CONTENT(const_80), MM4 )
PADDW ( MM4, MM2 ) /* t1 += 0x80 */
#endif
#if GMBT_GEOMETRIC_SERIES
MOVQ ( MM2, MM3 )
PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
#if GMBT_GEOMETRIC_CORRECTION
PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
#endif
#endif
PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */
MOVD ( MM2, REGIND(EDI) )
LLBL (GMBT_done):
EMMS
LLBL (GMBT_return):
POP_L ( EBX )
POP_L ( EDI )
POP_L ( ESI )
MOV_L ( EBP, ESP )
POP_L ( EBP )
RET

113
src/mesa/x86/mmx_blendtmp.h Normal file
View file

@ -0,0 +1,113 @@
/*
* Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
*/
/*
* void _mesa_mmx_blend( GLcontext *ctx,
* GLuint n,
* const GLubyte mask[],
* GLchan rgba[][4],
* CONST GLchan dest[][4] )
*
*/
/*
 * blending loop template: the including file must define
 *   TAG(x) - suffix macro making all symbols/labels unique per instantiation
 *   INIT   - code that sets up the constant MMX registers
 *   MAIN   - the blending kernel, written with ONE()/TWO() wrappers so the
 *            same text expands to a one-pixel (run-in/run-out) or a
 *            two-pixel (main loop) body
 *
 * register usage: ECX = n, EBX = mask, EDI = rgba, ESI = dest
 */
ALIGNTEXT16
GLOBL GLNAME( TAG(_mesa_mmx_blend) )
GLNAME( TAG(_mesa_mmx_blend) ):
PUSH_L ( EBP )
MOV_L ( ESP, EBP )
PUSH_L ( ESI )
PUSH_L ( EDI )
PUSH_L ( EBX )
MOV_L ( REGOFF(12, EBP), ECX ) /* n */
CMP_L ( CONST(0), ECX)
JE ( LLBL ( TAG(GMB_return) ) )
MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
INIT
TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
JZ ( LLBL ( TAG(GMB_align_end) ) )
CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
JE ( LLBL ( TAG(GMB_align_continue) ) )
/* runin: process a single pixel so the main loop sees 8-byte aligned rgba */
#define ONE(x) x
#define TWO(x)
MAIN
#undef ONE
#undef TWO
LLBL ( TAG(GMB_align_continue) ):
DEC_L ( ECX ) /* n -= 1 */
INC_L ( EBX ) /* mask += 1 */
ADD_L ( CONST(4), EDI ) /* rgba += 1 */
ADD_L ( CONST(4), ESI ) /* dest += 1 */
LLBL ( TAG(GMB_align_end) ):
CMP_L ( CONST(2), ECX)
JB ( LLBL ( TAG(GMB_loop_end) ) )
ALIGNTEXT16
LLBL ( TAG(GMB_loop_begin) ):
CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
JE ( LLBL ( TAG(GMB_loop_continue) ) )
/* main loop: two pixels per iteration */
#define ONE(x)
#define TWO(x) x
MAIN
#undef ONE
#undef TWO
LLBL ( TAG(GMB_loop_continue) ):
DEC_L ( ECX )
DEC_L ( ECX ) /* n -= 2 */
ADD_L ( CONST(2), EBX ) /* mask += 2 */
ADD_L ( CONST(8), EDI ) /* rgba += 2 */
ADD_L ( CONST(8), ESI ) /* dest += 2 */
CMP_L ( CONST(2), ECX )
JAE ( LLBL ( TAG(GMB_loop_begin) ) )
LLBL ( TAG(GMB_loop_end) ):
CMP_L ( CONST(1), ECX )
JB ( LLBL ( TAG(GMB_done) ) )
CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
JE ( LLBL ( TAG(GMB_done) ) )
/* runout: one trailing pixel left when the remaining count was odd */
#define ONE(x) x
#define TWO(x)
MAIN
#undef ONE
#undef TWO
LLBL ( TAG(GMB_done) ):
EMMS /* restore the FPU state for any following floating point code */
LLBL ( TAG(GMB_return) ):
POP_L ( EBX )
POP_L ( EDI )
POP_L ( ESI )
MOV_L ( EBP, ESP )
POP_L ( EBP )
RET
/* clean up the per-instantiation macros so the template can be reused */
#undef TAG
#undef INIT
#undef MAIN