mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 11:18:08 +02:00
tgsi: Implement fast rsqrtf. Not tested, inactive.
This commit is contained in:
parent
5e49037caa
commit
17058e0746
3 changed files with 40 additions and 19 deletions
|
|
@ -88,6 +88,10 @@
|
|||
#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
|
||||
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
|
||||
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
|
||||
#define TEMP_3_I TGSI_EXEC_TEMP_THREE_I
|
||||
#define TEMP_3_C TGSI_EXEC_TEMP_THREE_C
|
||||
#define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I
|
||||
#define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C
|
||||
#define TEMP_R0 TGSI_EXEC_TEMP_R0
|
||||
|
||||
#define FOR_EACH_CHANNEL(CHAN)\
|
||||
|
|
@ -262,6 +266,8 @@ tgsi_exec_machine_init(
|
|||
mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
|
||||
mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
|
||||
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
|
||||
mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
|
||||
mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -133,9 +133,15 @@ struct tgsi_exec_labels
|
|||
#define TGSI_EXEC_TEMP_PRIMITIVE_I 34
|
||||
#define TGSI_EXEC_TEMP_PRIMITIVE_C 2
|
||||
|
||||
#define TGSI_EXEC_TEMP_R0 35
|
||||
#define TGSI_EXEC_TEMP_THREE_I 34
|
||||
#define TGSI_EXEC_TEMP_THREE_C 3
|
||||
|
||||
#define TGSI_EXEC_NUM_TEMPS (32 + 4)
|
||||
#define TGSI_EXEC_TEMP_HALF_I 35
|
||||
#define TGSI_EXEC_TEMP_HALF_C 0
|
||||
|
||||
#define TGSI_EXEC_TEMP_R0 36
|
||||
|
||||
#define TGSI_EXEC_NUM_TEMPS (32 + 5)
|
||||
#define TGSI_EXEC_NUM_ADDRS 1
|
||||
#define TGSI_EXEC_NUM_IMMEDIATES 256
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,11 @@
|
|||
|
||||
#ifdef PIPE_ARCH_X86
|
||||
|
||||
#define HIGH_PRECISION 1 /* for 1/sqrt() */
|
||||
/* for 1/sqrt()
|
||||
*
|
||||
* This costs about 100fps (close to 10%) in gears:
|
||||
*/
|
||||
#define HIGH_PRECISION 1
|
||||
|
||||
|
||||
#define FOR_EACH_CHANNEL( CHAN )\
|
||||
|
|
@ -794,20 +798,25 @@ emit_rsqrt(
|
|||
*
|
||||
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
|
||||
*/
|
||||
/* This is some code that would do the above for a scalar 'a'. We
|
||||
* obviously are interested in a vector version:
|
||||
*
|
||||
* movss xmm3, a;
|
||||
* movss xmm1, half;
|
||||
* movss xmm2, three;
|
||||
* rsqrtss xmm0, xmm3;
|
||||
* mulss xmm3, xmm0;
|
||||
* mulss xmm1, xmm0;
|
||||
* mulss xmm3, xmm0;
|
||||
* subss xmm2, xmm3;
|
||||
* mulss xmm1, xmm2;
|
||||
* movss x, xmm1;
|
||||
*/
|
||||
{
|
||||
struct x86_reg dst = make_xmm( xmm_dst );
|
||||
struct x86_reg src = make_xmm( xmm_src );
|
||||
struct x86_reg tmp0 = make_xmm( 2 );
|
||||
struct x86_reg tmp1 = make_xmm( 3 );
|
||||
|
||||
assert( xmm_dst != xmm_src );
|
||||
assert( xmm_dst != 2 && xmm_dst != 3 );
|
||||
assert( xmm_src != 2 && xmm_src != 3 );
|
||||
|
||||
sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
|
||||
sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
|
||||
sse_rsqrtps( func, tmp1, src );
|
||||
sse_mulps( func, src, tmp1 );
|
||||
sse_mulps( func, dst, tmp1 );
|
||||
sse_mulps( func, src, tmp1 );
|
||||
sse_subps( func, tmp0, src );
|
||||
sse_mulps( func, dst, tmp0 );
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
/* On Intel CPUs at least, this is only accurate to 12 bits -- not
|
||||
|
|
@ -1295,9 +1304,9 @@ emit_instruction(
|
|||
case TGSI_OPCODE_RSQ:
|
||||
/* TGSI_OPCODE_RECIPSQRT */
|
||||
FETCH( func, *inst, 0, 0, CHAN_X );
|
||||
emit_rsqrt( func, 0, 0 );
|
||||
emit_rsqrt( func, 1, 0 );
|
||||
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
|
||||
STORE( func, *inst, 0, 0, chan_index );
|
||||
STORE( func, *inst, 1, 0, chan_index );
|
||||
}
|
||||
break;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue