Move tgsi machine state init/allocations so they're done less frequently.

This, plus expanding all instructions ahead of time, seems to have improved
the performance of program execution by 8x or so.
This commit is contained in:
Brian 2007-10-02 11:46:11 -06:00
parent 57d3770f35
commit 0d13ade0cd
5 changed files with 179 additions and 142 deletions

View file

@ -47,6 +47,8 @@
#include "draw_vertex.h"
#include "x86/rtasm/x86sse.h"
#include "pipe/tgsi/exec/tgsi_core.h"
/**
* Basic vertex info.
@ -187,6 +189,8 @@ struct draw_context
unsigned prim; /**< current prim type: PIPE_PRIM_x */
unsigned reduced_prim;
/** TGSI program interpreter runtime state */
struct tgsi_exec_machine machine;
/* Post-tnl vertex cache:
*/

View file

@ -86,7 +86,7 @@ run_vertex_program(struct draw_context *draw,
unsigned elts[4], unsigned count,
struct vertex_header *vOut[])
{
struct tgsi_exec_machine machine;
struct tgsi_exec_machine *machine = &draw->machine;
unsigned int j;
ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
@ -98,35 +98,39 @@ run_vertex_program(struct draw_context *draw,
assert(draw->vertex_shader->state->output_semantic_name[0]
== TGSI_SEMANTIC_POSITION);
#ifdef DEBUG
memset( &machine, 0, sizeof( machine ) );
#ifdef DEBUG_foo
memset( machine, 0, sizeof( *machine ) );
#endif
#if 0
/* init machine state */
tgsi_exec_machine_init(&machine,
tgsi_exec_machine_init(machine,
draw->vertex_shader->state->tokens,
PIPE_MAX_SAMPLERS,
NULL /*samplers*/ );
#endif
/* Consts does not require 16 byte alignment. */
machine.Consts = (float (*)[4]) draw->mapped_constants;
machine->Consts = (float (*)[4]) draw->mapped_constants;
machine.Inputs = ALIGN16_ASSIGN(inputs);
machine.Outputs = ALIGN16_ASSIGN(outputs);
machine->Inputs = ALIGN16_ASSIGN(inputs);
machine->Outputs = ALIGN16_ASSIGN(outputs);
draw_vertex_fetch( draw, &machine, elts, count );
draw_vertex_fetch( draw, machine, elts, count );
/* run shader */
if( draw->vertex_shader->state->executable != NULL ) {
/* SSE */
codegen_function func = (codegen_function) draw->vertex_shader->state->executable;
func(
machine.Inputs,
machine.Outputs,
machine.Consts,
machine.Temps );
machine->Inputs,
machine->Outputs,
machine->Consts,
machine->Temps );
}
else {
tgsi_exec_machine_run( &machine );
/* interpreter */
tgsi_exec_machine_run( machine );
}
@ -136,10 +140,10 @@ run_vertex_program(struct draw_context *draw,
float x, y, z, w;
/* Handle attr[0] (position) specially: */
x = vOut[j]->clip[0] = machine.Outputs[0].xyzw[0].f[j];
y = vOut[j]->clip[1] = machine.Outputs[0].xyzw[1].f[j];
z = vOut[j]->clip[2] = machine.Outputs[0].xyzw[2].f[j];
w = vOut[j]->clip[3] = machine.Outputs[0].xyzw[3].f[j];
x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
vOut[j]->clipmask = compute_clipmask(x, y, z, w) | draw->user_clipmask;
vOut[j]->edgeflag = 1;
@ -162,10 +166,10 @@ run_vertex_program(struct draw_context *draw,
* Subtract two because of the VERTEX_HEADER, CLIP_POS attribs.
*/
for (slot = 1; slot < draw->vertex_info.num_attribs - 2; slot++) {
vOut[j]->data[slot][0] = machine.Outputs[slot].xyzw[0].f[j];
vOut[j]->data[slot][1] = machine.Outputs[slot].xyzw[1].f[j];
vOut[j]->data[slot][2] = machine.Outputs[slot].xyzw[2].f[j];
vOut[j]->data[slot][3] = machine.Outputs[slot].xyzw[3].f[j];
vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
/*
printf("output %d: %f %f %f %f\n", slot,
vOut[j]->data[slot][0],
@ -235,6 +239,12 @@ void draw_bind_vertex_shader(struct draw_context *draw,
{
draw_flush(draw);
draw->vertex_shader = (struct draw_vertex_shader*)(vcso);
/* init machine state */
tgsi_exec_machine_init(&draw->machine,
draw->vertex_shader->state->tokens,
PIPE_MAX_SAMPLERS,
NULL /*samplers*/ );
}
void draw_delete_vertex_shader(struct draw_context *draw,

View file

@ -45,6 +45,8 @@ struct quad_shade_stage
{
struct quad_stage stage;
struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS];
struct tgsi_exec_machine machine;
struct tgsi_exec_vector *inputs, *outputs;
};
@ -83,58 +85,41 @@ shade_quad(
struct softpipe_context *softpipe = qs->softpipe;
const float fx = (float) quad->x0;
const float fy = (float) quad->y0;
struct tgsi_exec_machine machine;
ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
#ifdef DEBUG
memset( &machine, 0, sizeof( machine ) );
#endif
/* init machine state */
tgsi_exec_machine_init(
&machine,
softpipe->fs->tokens,
PIPE_MAX_SAMPLERS,
qss->samplers );
struct tgsi_exec_machine *machine = &qss->machine;
/* Consts does not require 16 byte alignment. */
machine.Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
machine.Inputs = ALIGN16_ASSIGN(inputs);
machine.Outputs = ALIGN16_ASSIGN(outputs);
machine->InterpCoefs = quad->coef;
machine.InterpCoefs = quad->coef;
machine->Inputs[0].xyzw[0].f[0] = fx;
machine->Inputs[0].xyzw[0].f[1] = fx + 1.0f;
machine->Inputs[0].xyzw[0].f[2] = fx;
machine->Inputs[0].xyzw[0].f[3] = fx + 1.0f;
machine.Inputs[0].xyzw[0].f[0] = fx;
machine.Inputs[0].xyzw[0].f[1] = fx + 1.0f;
machine.Inputs[0].xyzw[0].f[2] = fx;
machine.Inputs[0].xyzw[0].f[3] = fx + 1.0f;
machine.Inputs[0].xyzw[1].f[0] = fy;
machine.Inputs[0].xyzw[1].f[1] = fy;
machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f;
machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f;
machine->Inputs[0].xyzw[1].f[0] = fy;
machine->Inputs[0].xyzw[1].f[1] = fy;
machine->Inputs[0].xyzw[1].f[2] = fy + 1.0f;
machine->Inputs[0].xyzw[1].f[3] = fy + 1.0f;
/* run shader */
if( softpipe->fs->executable != NULL ) {
codegen_function func = (codegen_function) softpipe->fs->executable;
func(
machine.Inputs,
machine.Outputs,
machine.Consts,
machine.Temps,
machine.InterpCoefs );
machine->Inputs,
machine->Outputs,
machine->Consts,
machine->Temps,
machine->InterpCoefs );
}
else {
tgsi_exec_machine_run( &machine );
tgsi_exec_machine_run( machine );
}
/* store result color (always in output[1]) */
memcpy(
quad->outputs.color,
&machine.Outputs[1].xyzw[0].f[0],
&machine->Outputs[1].xyzw[0].f[0],
sizeof( quad->outputs.color ) );
#if 0
@ -142,14 +127,14 @@ shade_quad(
/* XXX temporary */
memcpy(
quad->outputs.depth,
&machine.Outputs[0].xyzw[2],
machine->Outputs[0].xyzw[2],
sizeof( quad->outputs.depth ) );
}
#else
{
uint i;
for (i = 0; i < 4; i++) {
quad->outputs.depth[i] = machine.Inputs[0].xyzw[2].f[i];
quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i];
#if 0
printf("output z %f\n", quad->outputs.depth[i]);
#endif
@ -188,6 +173,12 @@ static void shade_begin(struct quad_stage *qs)
}
}
/* XXX only do this if the fragment shader changes... */
tgsi_exec_machine_init(&qss->machine,
softpipe->fs->tokens,
PIPE_MAX_SAMPLERS,
qss->samplers );
if (qs->next)
qs->next->begin(qs->next);
}
@ -195,11 +186,17 @@ static void shade_begin(struct quad_stage *qs)
struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe )
{
struct quad_shade_stage *stage = CALLOC_STRUCT(quad_shade_stage);
struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage);
stage->stage.softpipe = softpipe;
stage->stage.begin = shade_begin;
stage->stage.run = shade_quad;
/* allocate storage for program inputs/outputs, aligned to 16 bytes */
qss->inputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16);
qss->outputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16);
qss->machine.Inputs = align16(qss->inputs);
qss->machine.Outputs = align16(qss->outputs);
return &stage->stage;
qss->stage.softpipe = softpipe;
qss->stage.begin = shade_begin;
qss->stage.run = shade_quad;
return &qss->stage;
}

View file

@ -65,6 +65,80 @@
#define CHAN_Z 2
#define CHAN_W 3
/**
 * Expand the machine's token stream (mach->Tokens) into two flat arrays,
 * one of full declarations and one of full instructions, and publish them
 * in mach->Declarations / mach->Instructions together with their counts.
 *
 * Immediate tokens are skipped.  Both arrays grow in steps of 10 entries.
 *
 * On a parse-init failure or an allocation failure a message is printed,
 * any partially built arrays are freed, and the machine is left unmodified.
 *
 * \param mach  the machine whose Tokens are expanded; mach->Instructions
 *              is asserted to be NULL before the new arrays are stored,
 *              so that a previous expansion is not overwritten.
 */
static void
expand_program(struct tgsi_exec_machine *mach )
{
   struct tgsi_full_instruction *instructions = NULL;
   struct tgsi_full_declaration *declarations = NULL;
   struct tgsi_parse_context parse;
   uint k;
   uint maxInstructions = 10, numInstructions = 0;
   uint maxDeclarations = 10, numDeclarations = 0;

   k = tgsi_parse_init( &parse, mach->Tokens );
   if (k != TGSI_PARSE_OK) {
      printf("Problem parsing!\n");
      return;
   }

   declarations = malloc(maxDeclarations * sizeof(*declarations));
   instructions = malloc(maxInstructions * sizeof(*instructions));
   if (!declarations || !instructions) {
      goto out_of_memory;
   }

   while( !tgsi_parse_end_of_tokens( &parse ) ) {
      tgsi_parse_token( &parse );
      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* save expanded declaration */
         if (numDeclarations == maxDeclarations) {
            /* Size the array by its own element type, and keep the old
             * pointer until realloc is known to have succeeded.
             */
            struct tgsi_full_declaration *grown =
               realloc(declarations,
                       (maxDeclarations + 10) * sizeof(*declarations));
            if (!grown) {
               goto out_of_memory;
            }
            declarations = grown;
            maxDeclarations += 10;
         }
         memcpy(declarations + numDeclarations,
                &parse.FullToken.FullDeclaration,
                sizeof(declarations[0]));
         numDeclarations++;
         break;
      case TGSI_TOKEN_TYPE_IMMEDIATE:
         break;
      case TGSI_TOKEN_TYPE_INSTRUCTION:
         /* save expanded instruction */
         if (numInstructions == maxInstructions) {
            struct tgsi_full_instruction *grown =
               realloc(instructions,
                       (maxInstructions + 10) * sizeof(*instructions));
            if (!grown) {
               goto out_of_memory;
            }
            instructions = grown;
            maxInstructions += 10;
         }
         memcpy(instructions + numInstructions,
                &parse.FullToken.FullInstruction,
                sizeof(instructions[0]));
         numInstructions++;
         break;
      default:
         assert( 0 );
      }
   }
   tgsi_parse_free (&parse);

   assert(!mach->Instructions);
   mach->Instructions = instructions;
   mach->NumInstructions = numInstructions;
   mach->Declarations = declarations;
   mach->NumDeclarations = numDeclarations;
   return;

out_of_memory:
   printf("Out of memory expanding program!\n");
   free(declarations);
   free(instructions);
   tgsi_parse_free (&parse);
}
/**
* Initialize machine state by expanding tokens to full instructions,
* allocating temporary storage, setting up constants, etc.
* After this, we can call tgsi_exec_machine_run() many times.
*/
void
tgsi_exec_machine_init(
struct tgsi_exec_machine *mach,
@ -103,16 +177,32 @@ tgsi_exec_machine_init(
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
}
if (mach->Declarations) {
free(mach->Declarations);
mach->Declarations = NULL;
mach->NumDeclarations = 0;
}
if (mach->Instructions) {
free(mach->Instructions);
mach->Instructions = NULL;
mach->NumInstructions = 0;
}
mach->CondMask = 0xf;
mach->LoopMask = 0xf;
mach->ExecMask = 0xf;
#if 01
tgsi_exec_prepare( mach );
expand_program(mach);
#endif
}
void
tgsi_exec_prepare(
struct tgsi_exec_machine *mach,
struct tgsi_exec_labels *labels )
struct tgsi_exec_machine *mach )
{
struct tgsi_exec_labels *labels = &mach->Labels;
struct tgsi_parse_context parse;
GLuint k;
GLuint instno = 0;
@ -164,10 +254,10 @@ void
tgsi_exec_machine_run(
struct tgsi_exec_machine *mach )
{
struct tgsi_exec_labels labels;
tgsi_exec_prepare( mach, &labels );
tgsi_exec_machine_run2( mach, &labels );
#if 0
tgsi_exec_prepare( mach );
#endif
tgsi_exec_machine_run2( mach );
}
static void
@ -2170,77 +2260,9 @@ exec_instruction(
}
/**
 * Expand the machine's token stream (mach->Tokens) into two flat arrays,
 * one of full declarations and one of full instructions, and store them
 * in mach->Declarations / mach->Instructions together with their counts.
 *
 * Immediate tokens are skipped.  Both arrays grow in steps of 10 entries.
 *
 * On a parse-init failure or an allocation failure a message is printed,
 * any partially built arrays are freed, and the machine is left unmodified.
 *
 * \param mach  the machine whose Tokens are expanded
 */
static void
expand_program(struct tgsi_exec_machine *mach )
{
   struct tgsi_full_instruction *instructions = NULL;
   struct tgsi_full_declaration *declarations = NULL;
   struct tgsi_parse_context parse;
   uint k;
   uint maxInstructions = 10, numInstructions = 0;
   uint maxDeclarations = 10, numDeclarations = 0;

   k = tgsi_parse_init( &parse, mach->Tokens );
   if (k != TGSI_PARSE_OK) {
      printf("Problem parsing!\n");
      return;
   }

   declarations = malloc(maxDeclarations * sizeof(*declarations));
   instructions = malloc(maxInstructions * sizeof(*instructions));
   if (!declarations || !instructions) {
      goto out_of_memory;
   }

   while( !tgsi_parse_end_of_tokens( &parse ) ) {
      tgsi_parse_token( &parse );
      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* save expanded declaration */
         if (numDeclarations == maxDeclarations) {
            /* Size the array by its own element type, and keep the old
             * pointer until realloc is known to have succeeded.
             */
            struct tgsi_full_declaration *grown =
               realloc(declarations,
                       (maxDeclarations + 10) * sizeof(*declarations));
            if (!grown) {
               goto out_of_memory;
            }
            declarations = grown;
            maxDeclarations += 10;
         }
         memcpy(declarations + numDeclarations,
                &parse.FullToken.FullDeclaration,
                sizeof(declarations[0]));
         numDeclarations++;
         break;
      case TGSI_TOKEN_TYPE_IMMEDIATE:
         break;
      case TGSI_TOKEN_TYPE_INSTRUCTION:
         /* save expanded instruction */
         if (numInstructions == maxInstructions) {
            struct tgsi_full_instruction *grown =
               realloc(instructions,
                       (maxInstructions + 10) * sizeof(*instructions));
            if (!grown) {
               goto out_of_memory;
            }
            instructions = grown;
            maxInstructions += 10;
         }
         memcpy(instructions + numInstructions,
                &parse.FullToken.FullInstruction,
                sizeof(instructions[0]));
         numInstructions++;
         break;
      default:
         assert( 0 );
      }
   }
   tgsi_parse_free (&parse);

   mach->Instructions = instructions;
   mach->NumInstructions = numInstructions;
   mach->Declarations = declarations;
   mach->NumDeclarations = numDeclarations;
   return;

out_of_memory:
   printf("Out of memory expanding program!\n");
   free(declarations);
   free(instructions);
   tgsi_parse_free (&parse);
}
void
tgsi_exec_machine_run2(
struct tgsi_exec_machine *mach,
struct tgsi_exec_labels *labels )
struct tgsi_exec_machine *mach )
{
#if 0 && MESA
GET_CURRENT_CONTEXT(ctx);
@ -2255,9 +2277,11 @@ tgsi_exec_machine_run2(
GLuint k;
#endif
#if 0
if (!mach->Instructions) {
expand_program(mach);
}
#endif
mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
@ -2305,8 +2329,10 @@ tgsi_exec_machine_run2(
exec_instruction( mach, mach->Instructions + pc, &pc );
}
#if 0
free(mach->Declarations);
free(mach->Instructions);
#endif
}
#endif

View file

@ -154,6 +154,8 @@ struct tgsi_exec_machine
struct tgsi_full_declaration *Declarations;
uint NumDeclarations;
struct tgsi_exec_labels Labels;
};
@ -166,8 +168,7 @@ tgsi_exec_machine_init(
void
tgsi_exec_prepare(
struct tgsi_exec_machine *mach,
struct tgsi_exec_labels *labels );
struct tgsi_exec_machine *mach );
void
tgsi_exec_machine_run(
@ -175,8 +176,7 @@ tgsi_exec_machine_run(
void
tgsi_exec_machine_run2(
struct tgsi_exec_machine *mach,
struct tgsi_exec_labels *labels );
struct tgsi_exec_machine *mach );
#if defined __cplusplus
} // extern "C"