tgsi: reduce x86 reg usage in tgsi_sse generated programs

Pass the tgsi_exec_machine struct in directly and just hold a single
pointer to this struct, rather than keeping one for each of its
internal members.
This commit is contained in:
Keith Whitwell 2009-07-16 07:50:34 +01:00 committed by Keith Whitwell
parent 4e3002b50f
commit ebc4a9bf2e
4 changed files with 119 additions and 154 deletions

View file

@ -52,24 +52,12 @@
#define SSE_MAX_VERTICES 4
typedef void (PIPE_CDECL *codegen_function) (
const struct tgsi_exec_vector *input, /* 1 */
struct tgsi_exec_vector *output, /* 2 */
float (*constant)[4], /* 3 */
struct tgsi_exec_vector *temporary, /* 4 */
float (*immediates)[4], /* 5 */
const float (*aos_input)[4], /* 6 */
uint num_inputs, /* 7 */
uint input_stride, /* 8 */
float (*aos_output)[4], /* 9 */
uint num_outputs, /* 10 */
uint output_stride ); /* 11 */
struct draw_sse_vertex_shader {
struct draw_vertex_shader base;
struct x86_function sse2_program;
codegen_function func;
tgsi_sse2_vs_func func;
struct tgsi_exec_machine *machine;
};
@ -119,11 +107,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
/* run compiled shader
*/
shader->func(machine->Inputs,
machine->Outputs,
(float (*)[4])constants,
machine->Temps,
(float (*)[4])shader->base.immediates,
shader->func(machine,
constants,
shader->base.immediates,
input,
base->info.num_inputs,
input_stride,
@ -195,7 +181,7 @@ draw_create_vs_sse(struct draw_context *draw,
TRUE ))
goto fail;
vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );
if (!vs->func) {
goto fail;
}

View file

@ -32,6 +32,7 @@
#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
@ -104,41 +105,51 @@ get_const_base( void )
}
static struct x86_reg
get_input_base( void )
get_machine_base( void )
{
return x86_make_reg(
file_REG32,
reg_AX );
}
/**
 * Operand addressing the Inputs member of struct tgsi_exec_machine.
 *
 * Built with x86_make_disp() from the register returned by
 * get_machine_base() and the offset of Inputs within the struct, so no
 * separate register is reserved for the input base.
 */
static struct x86_reg
get_input_base( void )
{
return x86_make_disp(
get_machine_base(),
Offset(struct tgsi_exec_machine, Inputs) );
}
static struct x86_reg
get_output_base( void )
{
return x86_make_reg(
file_REG32,
reg_DX );
return x86_make_disp(
get_machine_base(),
Offset(struct tgsi_exec_machine, Outputs) );
}
/**
 * Operand addressing the Temps member of struct tgsi_exec_machine.
 *
 * Built with x86_make_disp() from the register returned by
 * get_machine_base() and the offset of Temps within the struct, so no
 * separate register is reserved for the temporary base.
 */
static struct x86_reg
get_temp_base( void )
{
return x86_make_disp(
get_machine_base(),
Offset(struct tgsi_exec_machine, Temps) );
}
/**
 * Register operand (file_REG32, reg_BX) used as the coefficient base.
 *
 * tgsi_emit_sse2() loads function argument 4 into this register for
 * TGSI_PROCESSOR_FRAGMENT shaders, and pushes/pops BX around the
 * generated code.
 */
static struct x86_reg
get_coef_base( void )
{
return x86_make_reg(
file_REG32,
reg_BX );
}
static struct x86_reg
get_coef_base( void )
{
return get_output_base();
}
static struct x86_reg
get_immediate_base( void )
{
return x86_make_reg(
file_REG32,
reg_DI );
reg_DX );
}
@ -2551,7 +2562,7 @@ emit_declaration(
static void aos_to_soa( struct x86_function *func,
uint arg_aos,
uint arg_soa,
uint arg_machine,
uint arg_num,
uint arg_stride )
{
@ -2566,7 +2577,10 @@ static void aos_to_soa( struct x86_function *func,
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
x86_lea( func, soa_input,
x86_make_disp( soa_input,
Offset(struct tgsi_exec_machine, Inputs) ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
@ -2608,28 +2622,30 @@ static void aos_to_soa( struct x86_function *func,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
x86_pop( func, aos_input );
x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
static void soa_to_aos( struct x86_function *func,
uint arg_aos,
uint arg_machine,
uint arg_num,
uint arg_stride )
{
struct x86_reg soa_output;
struct x86_reg aos_output;
struct x86_reg num_outputs;
struct x86_reg temp;
struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
int inner_loop;
soa_output = x86_make_reg( file_REG32, reg_AX );
aos_output = x86_make_reg( file_REG32, reg_BX );
num_outputs = x86_make_reg( file_REG32, reg_CX );
temp = x86_make_reg( file_REG32, reg_DX );
/* Save EBX */
x86_push( func, aos_output );
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
x86_lea( func, soa_output,
x86_make_disp( soa_output,
Offset(struct tgsi_exec_machine, Outputs) ) );
x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
/* do */
inner_loop = x86_get_label( func );
@ -2646,7 +2662,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
x86_mov( func, temp, x86_fn_arg( func, stride ) );
x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
@ -2670,20 +2686,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
x86_pop( func, aos_output );
x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
/**
* Translate a TGSI vertex/fragment shader to SSE2 code.
* Slightly different things are done for vertex vs. fragment shaders.
*
* Note that fragment shaders are responsible for interpolating shader
* inputs. Because on x86 we have only 4 GP registers, and here we
* have 5 shader arguments (input, output, const, temp and coef), the
* code is split into two phases -- DECLARATION and INSTRUCTION phase.
* GP register holding the output argument is aliased with the coeff
* argument, as outputs are not needed in the DECLARATION phase.
*
* \param tokens the TGSI input shader
* \param func the output SSE code/function
* \param immediates buffer to place immediates, later passed to SSE func
@ -2697,7 +2706,6 @@ tgsi_emit_sse2(
boolean do_swizzles )
{
struct tgsi_parse_context parse;
boolean instruction_phase = FALSE;
unsigned ok = 1;
uint num_immediates = 0;
@ -2709,74 +2717,42 @@ tgsi_emit_sse2(
/* Can't just use EDI, EBX without save/restoring them:
*/
x86_push(
func,
get_immediate_base() );
x86_push(
func,
get_temp_base() );
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
/*
* Different function args for vertex/fragment shaders:
*/
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
/* DECLARATION phase, do not load output argument. */
x86_mov(
func,
get_input_base(),
x86_fn_arg( func, 1 ) );
/* skipping outputs argument here */
x86_mov(
func,
get_const_base(),
x86_fn_arg( func, 3 ) );
x86_mov(
func,
get_temp_base(),
x86_fn_arg( func, 4 ) );
x86_mov(
func,
get_coef_base(),
x86_fn_arg( func, 5 ) );
x86_mov(
func,
get_immediate_base(),
x86_fn_arg( func, 6 ) );
}
else {
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
aos_to_soa( func,
6, /* aos_input */
1, /* machine->input */
7, /* num_inputs */
8 ); /* input_stride */
x86_mov(
func,
get_input_base(),
x86_fn_arg( func, 1 ) );
x86_mov(
func,
get_output_base(),
x86_fn_arg( func, 2 ) );
x86_mov(
func,
get_const_base(),
x86_fn_arg( func, 3 ) );
x86_mov(
func,
get_temp_base(),
x86_fn_arg( func, 4 ) );
x86_mov(
func,
get_immediate_base(),
x86_fn_arg( func, 5 ) );
4, /* aos_input */
1, /* machine */
5, /* num_inputs */
6 ); /* input_stride */
}
x86_mov(
func,
get_machine_base(),
x86_fn_arg( func, 1 ) );
x86_mov(
func,
get_const_base(),
x86_fn_arg( func, 2 ) );
x86_mov(
func,
get_immediate_base(),
x86_fn_arg( func, 3 ) );
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
x86_mov(
func,
get_coef_base(),
x86_fn_arg( func, 4 ) );
}
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
tgsi_parse_token( &parse );
@ -2790,17 +2766,6 @@ tgsi_emit_sse2(
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
if( !instruction_phase ) {
/* INSTRUCTION phase, overwrite coeff with output. */
instruction_phase = TRUE;
x86_mov(
func,
get_output_base(),
x86_fn_arg( func, 2 ) );
}
}
ok = emit_instruction(
func,
&parse.FullToken.FullInstruction );
@ -2844,18 +2809,17 @@ tgsi_emit_sse2(
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
soa_to_aos( func, 9, 2, 10, 11 );
soa_to_aos( func,
7, /* aos_output */
1, /* machine */
8, /* num_outputs */
9 ); /* output_stride */
}
/* Can't just use EBX, EDI without save/restoring them:
*/
x86_pop(
func,
get_temp_base() );
x86_pop(
func,
get_immediate_base() );
x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
emit_ret( func );

View file

@ -34,6 +34,7 @@ extern "C" {
struct tgsi_token;
struct x86_function;
struct tgsi_interp_coef;
unsigned
tgsi_emit_sse2(
@ -42,6 +43,33 @@ tgsi_emit_sse2(
float (*immediates)[4],
boolean do_swizzles );
/* This is the function prototype generated when do_swizzles is false
* -- effectively for fragment shaders.
*
* The trailing numbers are the x86_fn_arg() indices tgsi_emit_sse2()
* uses: 1 is loaded into the machine base register, 2 into the
* constant base, 3 into the immediate base. Argument 4 is loaded into
* the coefficient base only when the shader's processor type is
* TGSI_PROCESSOR_FRAGMENT.
*/
typedef void (PIPE_CDECL *tgsi_sse2_fs_function) (
struct tgsi_exec_machine *machine, /* 1 */
const float (*constant)[4], /* 2 */
const float (*immediate)[4], /* 3 */
const struct tgsi_interp_coef *coef /* 4 */
);
/* This is the function prototype generated when do_swizzles is true
* -- effectively for vertex shaders.
*
* The trailing numbers are the x86_fn_arg() indices tgsi_emit_sse2()
* uses: 1 is loaded into the machine base register, 2 into the
* constant base, 3 into the immediate base. Arguments 4-6 are handed
* to aos_to_soa() and 7-9 to soa_to_aos(), each together with argument
* 1 as the machine; both calls are emitted only when the processor
* type is TGSI_PROCESSOR_VERTEX and do_swizzles is set.
*/
typedef void (PIPE_CDECL *tgsi_sse2_vs_func) (
struct tgsi_exec_machine *machine, /* 1 */
const float (*constant)[4], /* 2 */
const float (*immediate)[4], /* 3 */
const float (*aos_input)[4], /* 4 */
uint num_inputs, /* 5 */
uint input_stride, /* 6 */
float (*aos_output)[4], /* 7 */
uint num_outputs, /* 8 */
uint output_stride ); /* 9 */
#if defined __cplusplus
}
#endif

View file

@ -45,17 +45,6 @@
#include "rtasm/rtasm_x86sse.h"
/* Surely this should be defined somewhere in a tgsi header:
*/
typedef void (PIPE_CDECL *codegen_function)(
const struct tgsi_exec_vector *input,
struct tgsi_exec_vector *output,
const float (*constant)[4],
struct tgsi_exec_vector *temporary,
const struct tgsi_interp_coef *coef,
float (*immediates)[4]
//, const struct tgsi_exec_vector *quadPos
);
/**
@ -65,7 +54,7 @@ struct sp_sse_fragment_shader
{
struct sp_fragment_shader base;
struct x86_function sse2_program;
codegen_function func;
tgsi_sse2_fs_function func;
float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
};
@ -107,12 +96,10 @@ fs_sse_run( const struct sp_fragment_shader *base,
tgsi_set_kill_mask(machine, 0x0);
tgsi_set_exec_mask(machine, 1, 1, 1, 1);
shader->func( machine->Inputs,
machine->Outputs,
shader->func( machine,
machine->Consts,
machine->Temps,
machine->InterpCoefs,
shader->immediates
(const float (*)[4])shader->immediates,
machine->InterpCoefs
// , &machine->QuadPos
);
@ -151,7 +138,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe,
return NULL;
}
shader->func = (codegen_function) x86_get_func( &shader->sse2_program );
shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );
if (!shader->func) {
x86_release_func( &shader->sse2_program );
FREE(shader);