tgsi: SSE2 optimized exp2, log2 and pow implementations.

Special care must be taken when calling compiler-generated SSE2 functions
from the runtime-generated SSE2 code: the xmm registers must be saved, and
gcc must be told that the stack is not 16-byte aligned.

It would be more efficient to keep the stack pointer 16-byte aligned, but
that is too hairy and not consistent across all x86 architectures.

This has been tested on Linux x86 and Windows x86 userspace. It has not been
tested on x86-64, because that is broken for other reasons (even without this
change).
This commit is contained in:
José Fonseca 2008-09-30 01:12:52 +09:00
parent 906336cd7c
commit 5dc8e67078

View file

@ -28,6 +28,7 @@
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_sse.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
@ -480,10 +481,31 @@ emit_coef_dady(
* Function call helpers.
*/
/**
* NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
* defined with __attribute__((force_align_arg_pointer)), because we do not
* guarantee that the stack pointer is 16-byte aligned at the call, which is
* what gcc would otherwise assume.
*/
static void
emit_push_gp(
struct x86_function *func )
emit_func_call_dst(
struct x86_function *func,
unsigned xmm_save,
unsigned xmm_dst,
void (PIPE_CDECL *code)() )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
unsigned i, n, xmm;
unsigned xmm_mask;
/* Bitmask of the xmm registers to save */
xmm_mask = (1 << xmm_save) - 1;
xmm_mask &= ~(1 << xmm_dst);
sse_movaps(
func,
get_temp( TEMP_R0, 0 ),
make_xmm( xmm_dst ) );
x86_push(
func,
x86_make_reg( file_REG32, reg_AX) );
@ -493,12 +515,49 @@ emit_push_gp(
x86_push(
func,
x86_make_reg( file_REG32, reg_DX) );
}
for(i = 0, n = 0; i < 8; ++i)
if(xmm_mask & (1 << i))
++n;
x86_sub_imm(
func,
x86_make_reg( file_REG32, reg_SP ),
n*16);
for(i = 0, n = 0; i < 8; ++i)
if(xmm_mask & (1 << i)) {
sse_movups(
func,
x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
make_xmm( xmm ) );
++n;
}
x86_lea(
func,
ecx,
get_temp( TEMP_R0, 0 ) );
x86_push( func, ecx );
x86_mov_reg_imm( func, ecx, (unsigned long) code );
x86_call( func, ecx );
x86_pop(func, ecx );
for(i = 0, n = 0; i < 8; ++i)
if(xmm_mask & (1 << i)) {
sse_movups(
func,
make_xmm( xmm ),
x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
++n;
}
x86_add_imm(
func,
x86_make_reg( file_REG32, reg_SP ),
n*16);
static void
x86_pop_gp(
struct x86_function *func )
{
/* Restore GP registers in a reverse order.
*/
x86_pop(
@ -510,39 +569,6 @@ x86_pop_gp(
x86_pop(
func,
x86_make_reg( file_REG32, reg_AX) );
}
static void
emit_func_call_dst(
struct x86_function *func,
unsigned xmm_dst,
void (PIPE_CDECL *code)() )
{
sse_movaps(
func,
get_temp( TEMP_R0, 0 ),
make_xmm( xmm_dst ) );
emit_push_gp(
func );
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
x86_lea(
func,
ecx,
get_temp( TEMP_R0, 0 ) );
x86_push( func, ecx );
x86_mov_reg_imm( func, ecx, (unsigned long) code );
x86_call( func, ecx );
x86_pop(func, ecx );
}
x86_pop_gp(
func );
sse_movaps(
func,
@ -553,6 +579,7 @@ emit_func_call_dst(
static void
emit_func_call_dst_src(
struct x86_function *func,
unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src,
void (PIPE_CDECL *code)() )
@ -564,10 +591,111 @@ emit_func_call_dst_src(
emit_func_call_dst(
func,
xmm_save,
xmm_dst,
code );
}
/*
* Fast SSE2 implementation of special math functions.
*/
/*
 * Horner-scheme polynomial evaluation on packed single-precision floats.
 *
 * POLYn(v, k0, ..., kn) expands to k0 + v*(k1 + v*(... + v*kn)), with every
 * coefficient broadcast to all four lanes by _mm_set1_ps. The v argument is
 * expanded once per multiplication, so it should be free of side effects.
 * POLY0 ignores v; it only exists to terminate the recursion.
 */
#define POLY0(v, k0) \
   _mm_set1_ps(k0)
#define POLY1(v, k0, k1) \
   _mm_add_ps(_mm_mul_ps(POLY0(v, k1), v), _mm_set1_ps(k0))
#define POLY2(v, k0, k1, k2) \
   _mm_add_ps(_mm_mul_ps(POLY1(v, k1, k2), v), _mm_set1_ps(k0))
#define POLY3(v, k0, k1, k2, k3) \
   _mm_add_ps(_mm_mul_ps(POLY2(v, k1, k2, k3), v), _mm_set1_ps(k0))
#define POLY4(v, k0, k1, k2, k3, k4) \
   _mm_add_ps(_mm_mul_ps(POLY3(v, k1, k2, k3, k4), v), _mm_set1_ps(k0))
#define POLY5(v, k0, k1, k2, k3, k4, k5) \
   _mm_add_ps(_mm_mul_ps(POLY4(v, k1, k2, k3, k4, k5), v), _mm_set1_ps(k0))

/* Polynomial degree compiled into exp2f4 (it provides variants for 2..5). */
#define EXP_POLY_DEGREE 3
/* Polynomial degree compiled into log2f4 (it provides variants for 3..6). */
#define LOG_POLY_DEGREE 5
/**
 * Approximate 2**x on four packed single-precision floats.
 *
 * The input is clamped to [-126.99999, 129] and split into an integer part
 * and a remainder. The result is 2**integer (assembled directly in the float
 * exponent field) multiplied by a polynomial evaluated on the remainder.
 * EXP_POLY_DEGREE selects which polynomial is compiled in.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   const __m128 hi_limit = _mm_set1_ps( 129.00000f);
   const __m128 lo_limit = _mm_set1_ps(-126.99999f);
   const __m128 half = _mm_set1_ps(0.5f);
   const __m128i bias = _mm_set1_epi32(127);
   __m128i whole;
   __m128 clamped, frac, pow2_whole, pow2_frac;

   clamped = _mm_max_ps(_mm_min_ps(x, hi_limit), lo_limit);

   /* whole = int(clamped - 0.5) */
   whole = _mm_cvtps_epi32(_mm_sub_ps(clamped, half));

   /* frac = clamped - whole */
   frac = _mm_sub_ps(clamped, _mm_cvtepi32_ps(whole));

   /* pow2_whole = (float) (1 << whole): biased exponent shifted into place */
   pow2_whole = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(whole, bias), 23));

   /* Minimax polynomial fit of 2**frac.
    * NOTE(review): when the conversion above rounds to nearest, frac lies in
    * [0, 1] rather than [-0.5, 0.5[; the coefficients sum to ~2, which matches
    * a fit over [0, 1[ -- confirm against the reference.
    */
#if EXP_POLY_DEGREE == 5
   pow2_frac = POLY5(frac, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   pow2_frac = POLY4(frac, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   pow2_frac = POLY3(frac, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   pow2_frac = POLY2(frac, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(pow2_whole, pow2_frac);
}
/**
 * Approximate log2(x) on four packed single-precision floats.
 *
 * The exponent field of each lane is extracted and unbiased, the mantissa is
 * rebuilt as a float in [1, 2[, and the result is the exponent plus a
 * polynomial in the mantissa. The sign bit is covered by neither mask, so it
 * does not take part in the computation. LOG_POLY_DEGREE selects which
 * polynomial is compiled in.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   const __m128i exponent_bits = _mm_set1_epi32(0x7f800000);
   const __m128i mantissa_bits = _mm_set1_epi32(0x007fffff);
   const __m128i bias = _mm_set1_epi32(127);
   const __m128 unity = _mm_set1_ps(1.0f);
   const __m128i raw = _mm_castps_si128(x);
   __m128i biased_exponent;
   __m128 exponent, mantissa, poly;

   /* exponent = (float) exponent(x) */
   biased_exponent = _mm_srli_epi32(_mm_and_si128(raw, exponent_bits), 23);
   exponent = _mm_cvtepi32_ps(_mm_sub_epi32(biased_exponent, bias));

   /* mantissa = (float) mantissa(x); OR-ing in 1.0f gives it a zero exponent */
   mantissa = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(raw, mantissa_bits)), unity);

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generated with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   poly = POLY5(mantissa, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   poly = POLY4(mantissa, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   poly = POLY3(mantissa, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   poly = POLY2(mantissa, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* Multiplying by (mantissa - 1) raises the effective polynomial degree by
    * one, and guarantees that log2(1) == 0.
    */
   poly = _mm_mul_ps(poly, _mm_sub_ps(mantissa, unity));

   return _mm_add_ps(poly, exponent);
}
/**
 * Approximate x**y on four packed floats, computed as
 * exp2f4(log2f4(x) * y).
 */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   const __m128 log2_x = log2f4(x);
   const __m128 scaled = _mm_mul_ps(log2_x, y);

   return exp2f4(scaled);
}
/**
* Low-level instruction translators.
*/
@ -610,38 +738,35 @@ cos4f(
/**
 * Emit a call to cos4f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, cos4f );
}
/**
 * Replace the four floats at store with an approximation of 2**store[i],
 * using the SSE2 exp2f4() helper.
 *
 * The values are transformed exactly once; running a scalar exp2 pass first
 * and then exp2f4() on the result would yield 2**(2**x).
 *
 * store must be 16-byte aligned, as required by _mm_load_ps/_mm_store_ps.
 * With gcc the function realigns its own stack (force_align_arg_pointer),
 * because the runtime-generated caller does not keep the stack pointer
 * 16-byte aligned.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
}
/**
 * Emit a call to ex24f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, ex24f );
}
@ -670,10 +795,12 @@ flr4f(
/**
 * Emit a call to flr4f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, flr4f );
}
@ -691,31 +818,35 @@ frc4f(
/**
 * Emit a call to frc4f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, frc4f );
}
/**
 * Replace the four floats at store with an approximation of log2(store[i]),
 * using the SSE2 log2f4() helper.
 *
 * The values are transformed exactly once; running a scalar log2 pass first
 * and then log2f4() on the result would yield log2(log2(x)).
 *
 * store must be 16-byte aligned, as required by _mm_load_ps/_mm_store_ps.
 * With gcc the function realigns its own stack (force_align_arg_pointer),
 * because the runtime-generated caller does not keep the stack pointer
 * 16-byte aligned.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
}
/**
 * Emit a call to lg24f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, lg24f );
}
@ -757,14 +888,14 @@ emit_neg(
}
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
float *store )
{
#if FAST_MATH
store[0] = util_fast_pow( store[0], store[4] );
store[1] = util_fast_pow( store[1], store[5] );
store[2] = util_fast_pow( store[2], store[6] );
store[3] = util_fast_pow( store[3], store[7] );
#if 1
_mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
store[0] = powf( store[0], store[4] );
store[1] = powf( store[1], store[5] );
@ -776,11 +907,13 @@ pow4f(
static void
emit_pow(
struct x86_function *func,
unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src )
{
emit_func_call_dst_src(
func,
xmm_save,
xmm_dst,
xmm_src,
pow4f );
@ -873,10 +1006,12 @@ sin4f(
/**
 * Emit a call to sin4f() operating on register xmm_dst.
 *
 * Thin wrapper: func, xmm_save and xmm_dst are forwarded unchanged to
 * emit_func_call_dst().
 */
static void
emit_sin(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst( func, xmm_save, xmm_dst, sin4f );
}
@ -1296,7 +1431,7 @@ emit_instruction(
get_temp(
TGSI_EXEC_TEMP_MINUS_128_I,
TGSI_EXEC_TEMP_MINUS_128_C ) );
emit_pow( func, 1, 2 );
emit_pow( func, 3, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
sse_xorps(
func,
@ -1342,11 +1477,11 @@ emit_instruction(
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_MOV( func, 1, 0 );
emit_flr( func, 1 );
emit_flr( func, 2, 1 );
/* dst.x = ex2(floor(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
emit_MOV( func, 2, 1 );
emit_ex2( func, 2 );
emit_ex2( func, 3, 2 );
STORE( func, *inst, 2, 0, CHAN_X );
}
/* dst.y = src.x - floor(src.x) */
@ -1358,7 +1493,7 @@ emit_instruction(
}
/* dst.z = ex2(src.x) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
emit_ex2( func, 0 );
emit_ex2( func, 3, 0 );
STORE( func, *inst, 0, 0, CHAN_Z );
}
}
@ -1376,21 +1511,21 @@ emit_instruction(
FETCH( func, *inst, 0, 0, CHAN_X );
emit_abs( func, 0 );
emit_MOV( func, 1, 0 );
emit_lg2( func, 1 );
emit_lg2( func, 2, 1 );
/* dst.z = lg2(abs(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
STORE( func, *inst, 1, 0, CHAN_Z );
}
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_flr( func, 1 );
emit_flr( func, 2, 1 );
/* dst.x = floor(lg2(abs(src.x))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( func, *inst, 1, 0, CHAN_X );
}
/* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_ex2( func, 1 );
emit_ex2( func, 2, 1 );
emit_rcp( func, 1, 1 );
emit_mul( func, 0, 1 );
STORE( func, *inst, 0, 0, CHAN_Y );
@ -1580,7 +1715,7 @@ emit_instruction(
/* TGSI_OPCODE_FRC */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
emit_frc( func, 0 );
emit_frc( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@ -1593,7 +1728,7 @@ emit_instruction(
/* TGSI_OPCODE_FLR */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
emit_flr( func, 0 );
emit_flr( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@ -1605,7 +1740,7 @@ emit_instruction(
case TGSI_OPCODE_EXPBASE2:
/* TGSI_OPCODE_EX2 */
FETCH( func, *inst, 0, 0, CHAN_X );
emit_ex2( func, 0 );
emit_ex2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@ -1614,7 +1749,7 @@ emit_instruction(
case TGSI_OPCODE_LOGBASE2:
/* TGSI_OPCODE_LG2 */
FETCH( func, *inst, 0, 0, CHAN_X );
emit_lg2( func, 0 );
emit_lg2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@ -1624,7 +1759,7 @@ emit_instruction(
/* TGSI_OPCODE_POW */
FETCH( func, *inst, 0, 0, CHAN_X );
FETCH( func, *inst, 1, 1, CHAN_X );
emit_pow( func, 0, 1 );
emit_pow( func, 0, 0, 1 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@ -1715,7 +1850,7 @@ emit_instruction(
case TGSI_OPCODE_COS:
FETCH( func, *inst, 0, 0, CHAN_X );
emit_cos( func, 0 );
emit_cos( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@ -1774,7 +1909,7 @@ emit_instruction(
case TGSI_OPCODE_SIN:
FETCH( func, *inst, 0, 0, CHAN_X );
emit_sin( func, 0 );
emit_sin( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@ -1868,12 +2003,12 @@ emit_instruction(
case TGSI_OPCODE_SCS:
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
FETCH( func, *inst, 0, 0, CHAN_X );
emit_cos( func, 0 );
emit_cos( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_X );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
FETCH( func, *inst, 0, 0, CHAN_X );
emit_sin( func, 0 );
emit_sin( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_Y );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {