mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-26 15:00:10 +01:00
Move tgsi machine state init/allocations so they're done less frequently.
This, plus expanding all instructions ahead of time, seems to have improved the performance of program execution by 8x or so.
This commit is contained in:
parent
57d3770f35
commit
0d13ade0cd
5 changed files with 179 additions and 142 deletions
|
|
@ -47,6 +47,8 @@
|
|||
#include "draw_vertex.h"
|
||||
|
||||
#include "x86/rtasm/x86sse.h"
|
||||
#include "pipe/tgsi/exec/tgsi_core.h"
|
||||
|
||||
|
||||
/**
|
||||
* Basic vertex info.
|
||||
|
|
@ -187,6 +189,8 @@ struct draw_context
|
|||
unsigned prim; /**< current prim type: PIPE_PRIM_x */
|
||||
unsigned reduced_prim;
|
||||
|
||||
/** TGSI program interpreter runtime state */
|
||||
struct tgsi_exec_machine machine;
|
||||
|
||||
/* Post-tnl vertex cache:
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ run_vertex_program(struct draw_context *draw,
|
|||
unsigned elts[4], unsigned count,
|
||||
struct vertex_header *vOut[])
|
||||
{
|
||||
struct tgsi_exec_machine machine;
|
||||
struct tgsi_exec_machine *machine = &draw->machine;
|
||||
unsigned int j;
|
||||
|
||||
ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
|
||||
|
|
@ -98,35 +98,39 @@ run_vertex_program(struct draw_context *draw,
|
|||
assert(draw->vertex_shader->state->output_semantic_name[0]
|
||||
== TGSI_SEMANTIC_POSITION);
|
||||
|
||||
#ifdef DEBUG
|
||||
memset( &machine, 0, sizeof( machine ) );
|
||||
#ifdef DEBUG_foo
|
||||
memset( machine, 0, sizeof( *machine ) );
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/* init machine state */
|
||||
tgsi_exec_machine_init(&machine,
|
||||
tgsi_exec_machine_init(machine,
|
||||
draw->vertex_shader->state->tokens,
|
||||
PIPE_MAX_SAMPLERS,
|
||||
NULL /*samplers*/ );
|
||||
#endif
|
||||
|
||||
/* Consts does not require 16 byte alignment. */
|
||||
machine.Consts = (float (*)[4]) draw->mapped_constants;
|
||||
machine->Consts = (float (*)[4]) draw->mapped_constants;
|
||||
|
||||
machine.Inputs = ALIGN16_ASSIGN(inputs);
|
||||
machine.Outputs = ALIGN16_ASSIGN(outputs);
|
||||
machine->Inputs = ALIGN16_ASSIGN(inputs);
|
||||
machine->Outputs = ALIGN16_ASSIGN(outputs);
|
||||
|
||||
draw_vertex_fetch( draw, &machine, elts, count );
|
||||
draw_vertex_fetch( draw, machine, elts, count );
|
||||
|
||||
/* run shader */
|
||||
if( draw->vertex_shader->state->executable != NULL ) {
|
||||
/* SSE */
|
||||
codegen_function func = (codegen_function) draw->vertex_shader->state->executable;
|
||||
func(
|
||||
machine.Inputs,
|
||||
machine.Outputs,
|
||||
machine.Consts,
|
||||
machine.Temps );
|
||||
machine->Inputs,
|
||||
machine->Outputs,
|
||||
machine->Consts,
|
||||
machine->Temps );
|
||||
}
|
||||
else {
|
||||
tgsi_exec_machine_run( &machine );
|
||||
/* interpreter */
|
||||
tgsi_exec_machine_run( machine );
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -136,10 +140,10 @@ run_vertex_program(struct draw_context *draw,
|
|||
float x, y, z, w;
|
||||
|
||||
/* Handle attr[0] (position) specially: */
|
||||
x = vOut[j]->clip[0] = machine.Outputs[0].xyzw[0].f[j];
|
||||
y = vOut[j]->clip[1] = machine.Outputs[0].xyzw[1].f[j];
|
||||
z = vOut[j]->clip[2] = machine.Outputs[0].xyzw[2].f[j];
|
||||
w = vOut[j]->clip[3] = machine.Outputs[0].xyzw[3].f[j];
|
||||
x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
|
||||
y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
|
||||
z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
|
||||
w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
|
||||
|
||||
vOut[j]->clipmask = compute_clipmask(x, y, z, w) | draw->user_clipmask;
|
||||
vOut[j]->edgeflag = 1;
|
||||
|
|
@ -162,10 +166,10 @@ run_vertex_program(struct draw_context *draw,
|
|||
* Subtract two because of the VERTEX_HEADER, CLIP_POS attribs.
|
||||
*/
|
||||
for (slot = 1; slot < draw->vertex_info.num_attribs - 2; slot++) {
|
||||
vOut[j]->data[slot][0] = machine.Outputs[slot].xyzw[0].f[j];
|
||||
vOut[j]->data[slot][1] = machine.Outputs[slot].xyzw[1].f[j];
|
||||
vOut[j]->data[slot][2] = machine.Outputs[slot].xyzw[2].f[j];
|
||||
vOut[j]->data[slot][3] = machine.Outputs[slot].xyzw[3].f[j];
|
||||
vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
|
||||
vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
|
||||
vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
|
||||
vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
|
||||
/*
|
||||
printf("output %d: %f %f %f %f\n", slot,
|
||||
vOut[j]->data[slot][0],
|
||||
|
|
@ -235,6 +239,12 @@ void draw_bind_vertex_shader(struct draw_context *draw,
|
|||
{
|
||||
draw_flush(draw);
|
||||
draw->vertex_shader = (struct draw_vertex_shader*)(vcso);
|
||||
|
||||
/* init machine state */
|
||||
tgsi_exec_machine_init(&draw->machine,
|
||||
draw->vertex_shader->state->tokens,
|
||||
PIPE_MAX_SAMPLERS,
|
||||
NULL /*samplers*/ );
|
||||
}
|
||||
|
||||
void draw_delete_vertex_shader(struct draw_context *draw,
|
||||
|
|
|
|||
|
|
@ -45,6 +45,8 @@ struct quad_shade_stage
|
|||
{
|
||||
struct quad_stage stage;
|
||||
struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS];
|
||||
struct tgsi_exec_machine machine;
|
||||
struct tgsi_exec_vector *inputs, *outputs;
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -83,58 +85,41 @@ shade_quad(
|
|||
struct softpipe_context *softpipe = qs->softpipe;
|
||||
const float fx = (float) quad->x0;
|
||||
const float fy = (float) quad->y0;
|
||||
struct tgsi_exec_machine machine;
|
||||
|
||||
ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
|
||||
ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
|
||||
|
||||
#ifdef DEBUG
|
||||
memset( &machine, 0, sizeof( machine ) );
|
||||
#endif
|
||||
|
||||
/* init machine state */
|
||||
tgsi_exec_machine_init(
|
||||
&machine,
|
||||
softpipe->fs->tokens,
|
||||
PIPE_MAX_SAMPLERS,
|
||||
qss->samplers );
|
||||
struct tgsi_exec_machine *machine = &qss->machine;
|
||||
|
||||
/* Consts does not require 16 byte alignment. */
|
||||
machine.Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
|
||||
machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
|
||||
|
||||
machine.Inputs = ALIGN16_ASSIGN(inputs);
|
||||
machine.Outputs = ALIGN16_ASSIGN(outputs);
|
||||
machine->InterpCoefs = quad->coef;
|
||||
|
||||
machine.InterpCoefs = quad->coef;
|
||||
machine->Inputs[0].xyzw[0].f[0] = fx;
|
||||
machine->Inputs[0].xyzw[0].f[1] = fx + 1.0f;
|
||||
machine->Inputs[0].xyzw[0].f[2] = fx;
|
||||
machine->Inputs[0].xyzw[0].f[3] = fx + 1.0f;
|
||||
|
||||
machine.Inputs[0].xyzw[0].f[0] = fx;
|
||||
machine.Inputs[0].xyzw[0].f[1] = fx + 1.0f;
|
||||
machine.Inputs[0].xyzw[0].f[2] = fx;
|
||||
machine.Inputs[0].xyzw[0].f[3] = fx + 1.0f;
|
||||
|
||||
machine.Inputs[0].xyzw[1].f[0] = fy;
|
||||
machine.Inputs[0].xyzw[1].f[1] = fy;
|
||||
machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f;
|
||||
machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f;
|
||||
machine->Inputs[0].xyzw[1].f[0] = fy;
|
||||
machine->Inputs[0].xyzw[1].f[1] = fy;
|
||||
machine->Inputs[0].xyzw[1].f[2] = fy + 1.0f;
|
||||
machine->Inputs[0].xyzw[1].f[3] = fy + 1.0f;
|
||||
|
||||
/* run shader */
|
||||
if( softpipe->fs->executable != NULL ) {
|
||||
codegen_function func = (codegen_function) softpipe->fs->executable;
|
||||
func(
|
||||
machine.Inputs,
|
||||
machine.Outputs,
|
||||
machine.Consts,
|
||||
machine.Temps,
|
||||
machine.InterpCoefs );
|
||||
machine->Inputs,
|
||||
machine->Outputs,
|
||||
machine->Consts,
|
||||
machine->Temps,
|
||||
machine->InterpCoefs );
|
||||
}
|
||||
else {
|
||||
tgsi_exec_machine_run( &machine );
|
||||
tgsi_exec_machine_run( machine );
|
||||
}
|
||||
|
||||
/* store result color (always in output[1]) */
|
||||
memcpy(
|
||||
quad->outputs.color,
|
||||
&machine.Outputs[1].xyzw[0].f[0],
|
||||
&machine->Outputs[1].xyzw[0].f[0],
|
||||
sizeof( quad->outputs.color ) );
|
||||
|
||||
#if 0
|
||||
|
|
@ -142,14 +127,14 @@ shade_quad(
|
|||
/* XXX temporary */
|
||||
memcpy(
|
||||
quad->outputs.depth,
|
||||
&machine.Outputs[0].xyzw[2],
|
||||
machine->Outputs[0].xyzw[2],
|
||||
sizeof( quad->outputs.depth ) );
|
||||
}
|
||||
#else
|
||||
{
|
||||
uint i;
|
||||
for (i = 0; i < 4; i++) {
|
||||
quad->outputs.depth[i] = machine.Inputs[0].xyzw[2].f[i];
|
||||
quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i];
|
||||
#if 0
|
||||
printf("output z %f\n", quad->outputs.depth[i]);
|
||||
#endif
|
||||
|
|
@ -188,6 +173,12 @@ static void shade_begin(struct quad_stage *qs)
|
|||
}
|
||||
}
|
||||
|
||||
/* XXX only do this if the fragment shader changes... */
|
||||
tgsi_exec_machine_init(&qss->machine,
|
||||
softpipe->fs->tokens,
|
||||
PIPE_MAX_SAMPLERS,
|
||||
qss->samplers );
|
||||
|
||||
if (qs->next)
|
||||
qs->next->begin(qs->next);
|
||||
}
|
||||
|
|
@ -195,11 +186,17 @@ static void shade_begin(struct quad_stage *qs)
|
|||
|
||||
struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe )
|
||||
{
|
||||
struct quad_shade_stage *stage = CALLOC_STRUCT(quad_shade_stage);
|
||||
struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage);
|
||||
|
||||
stage->stage.softpipe = softpipe;
|
||||
stage->stage.begin = shade_begin;
|
||||
stage->stage.run = shade_quad;
|
||||
/* allocate storage for program inputs/outputs, aligned to 16 bytes */
|
||||
qss->inputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16);
|
||||
qss->outputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16);
|
||||
qss->machine.Inputs = align16(qss->inputs);
|
||||
qss->machine.Outputs = align16(qss->outputs);
|
||||
|
||||
return &stage->stage;
|
||||
qss->stage.softpipe = softpipe;
|
||||
qss->stage.begin = shade_begin;
|
||||
qss->stage.run = shade_quad;
|
||||
|
||||
return &qss->stage;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,6 +65,80 @@
|
|||
#define CHAN_Z 2
|
||||
#define CHAN_W 3
|
||||
|
||||
|
||||
static void
|
||||
expand_program(struct tgsi_exec_machine *mach )
|
||||
{
|
||||
struct tgsi_full_instruction *instructions;
|
||||
struct tgsi_full_declaration *declarations;
|
||||
struct tgsi_parse_context parse;
|
||||
uint k;
|
||||
uint maxInstructions = 10, numInstructions = 0;
|
||||
uint maxDeclarations = 10, numDeclarations = 0;
|
||||
|
||||
k = tgsi_parse_init( &parse, mach->Tokens );
|
||||
if (k != TGSI_PARSE_OK) {
|
||||
printf("Problem parsing!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
declarations = (struct tgsi_full_declaration *)
|
||||
malloc(maxDeclarations * sizeof(struct tgsi_full_declaration));
|
||||
|
||||
instructions = (struct tgsi_full_instruction *)
|
||||
malloc(maxInstructions * sizeof(struct tgsi_full_instruction));
|
||||
|
||||
while( !tgsi_parse_end_of_tokens( &parse ) ) {
|
||||
tgsi_parse_token( &parse );
|
||||
switch( parse.FullToken.Token.Type ) {
|
||||
case TGSI_TOKEN_TYPE_DECLARATION:
|
||||
/*
|
||||
exec_declaration( mach, &parse.FullToken.FullDeclaration );
|
||||
*/
|
||||
if (numDeclarations == maxDeclarations) {
|
||||
maxDeclarations += 10;
|
||||
declarations = realloc(declarations,
|
||||
maxDeclarations
|
||||
* sizeof(struct tgsi_full_instruction));
|
||||
}
|
||||
memcpy(declarations + numDeclarations,
|
||||
&parse.FullToken.FullInstruction,
|
||||
sizeof(declarations[0]));
|
||||
numDeclarations++;
|
||||
break;
|
||||
case TGSI_TOKEN_TYPE_IMMEDIATE:
|
||||
break;
|
||||
case TGSI_TOKEN_TYPE_INSTRUCTION:
|
||||
if (numInstructions == maxInstructions) {
|
||||
maxInstructions += 10;
|
||||
instructions = realloc(instructions,
|
||||
maxInstructions
|
||||
* sizeof(struct tgsi_full_instruction));
|
||||
}
|
||||
memcpy(instructions + numInstructions,
|
||||
&parse.FullToken.FullInstruction,
|
||||
sizeof(instructions[0]));
|
||||
numInstructions++;
|
||||
break;
|
||||
default:
|
||||
assert( 0 );
|
||||
}
|
||||
}
|
||||
tgsi_parse_free (&parse);
|
||||
|
||||
assert(!mach->Instructions);
|
||||
mach->Instructions = instructions;
|
||||
mach->NumInstructions = numInstructions;
|
||||
mach->Declarations = declarations;
|
||||
mach->NumDeclarations = numDeclarations;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initialize machine state by expanding tokens to full instructions,
|
||||
* allocating temporary storage, setting up constants, etc.
|
||||
* After this, we can call tgsi_exec_machine_run() many times.
|
||||
*/
|
||||
void
|
||||
tgsi_exec_machine_init(
|
||||
struct tgsi_exec_machine *mach,
|
||||
|
|
@ -103,16 +177,32 @@ tgsi_exec_machine_init(
|
|||
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
|
||||
}
|
||||
|
||||
if (mach->Declarations) {
|
||||
free(mach->Declarations);
|
||||
mach->Declarations = NULL;
|
||||
mach->NumDeclarations = 0;
|
||||
}
|
||||
if (mach->Instructions) {
|
||||
free(mach->Instructions);
|
||||
mach->Instructions = NULL;
|
||||
mach->NumInstructions = 0;
|
||||
}
|
||||
|
||||
mach->CondMask = 0xf;
|
||||
mach->LoopMask = 0xf;
|
||||
mach->ExecMask = 0xf;
|
||||
|
||||
#if 01
|
||||
tgsi_exec_prepare( mach );
|
||||
expand_program(mach);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
tgsi_exec_prepare(
|
||||
struct tgsi_exec_machine *mach,
|
||||
struct tgsi_exec_labels *labels )
|
||||
struct tgsi_exec_machine *mach )
|
||||
{
|
||||
struct tgsi_exec_labels *labels = &mach->Labels;
|
||||
struct tgsi_parse_context parse;
|
||||
GLuint k;
|
||||
GLuint instno = 0;
|
||||
|
|
@ -164,10 +254,10 @@ void
|
|||
tgsi_exec_machine_run(
|
||||
struct tgsi_exec_machine *mach )
|
||||
{
|
||||
struct tgsi_exec_labels labels;
|
||||
|
||||
tgsi_exec_prepare( mach, &labels );
|
||||
tgsi_exec_machine_run2( mach, &labels );
|
||||
#if 0
|
||||
tgsi_exec_prepare( mach );
|
||||
#endif
|
||||
tgsi_exec_machine_run2( mach );
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -2170,77 +2260,9 @@ exec_instruction(
|
|||
}
|
||||
|
||||
|
||||
static void
|
||||
expand_program(struct tgsi_exec_machine *mach )
|
||||
{
|
||||
struct tgsi_full_instruction *instructions;
|
||||
struct tgsi_full_declaration *declarations;
|
||||
struct tgsi_parse_context parse;
|
||||
uint k;
|
||||
uint maxInstructions = 10, numInstructions = 0;
|
||||
uint maxDeclarations = 10, numDeclarations = 0;
|
||||
|
||||
k = tgsi_parse_init( &parse, mach->Tokens );
|
||||
if (k != TGSI_PARSE_OK) {
|
||||
printf("Problem parsing!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
declarations = (struct tgsi_full_declaration *)
|
||||
malloc(maxDeclarations * sizeof(struct tgsi_full_declaration));
|
||||
|
||||
instructions = (struct tgsi_full_instruction *)
|
||||
malloc(maxInstructions * sizeof(struct tgsi_full_instruction));
|
||||
|
||||
while( !tgsi_parse_end_of_tokens( &parse ) ) {
|
||||
tgsi_parse_token( &parse );
|
||||
switch( parse.FullToken.Token.Type ) {
|
||||
case TGSI_TOKEN_TYPE_DECLARATION:
|
||||
/*
|
||||
exec_declaration( mach, &parse.FullToken.FullDeclaration );
|
||||
*/
|
||||
if (numDeclarations == maxDeclarations) {
|
||||
maxDeclarations += 10;
|
||||
declarations = realloc(declarations,
|
||||
maxDeclarations
|
||||
* sizeof(struct tgsi_full_instruction));
|
||||
}
|
||||
memcpy(declarations + numDeclarations,
|
||||
&parse.FullToken.FullInstruction,
|
||||
sizeof(declarations[0]));
|
||||
numDeclarations++;
|
||||
break;
|
||||
case TGSI_TOKEN_TYPE_IMMEDIATE:
|
||||
break;
|
||||
case TGSI_TOKEN_TYPE_INSTRUCTION:
|
||||
if (numInstructions == maxInstructions) {
|
||||
maxInstructions += 10;
|
||||
instructions = realloc(instructions,
|
||||
maxInstructions
|
||||
* sizeof(struct tgsi_full_instruction));
|
||||
}
|
||||
memcpy(instructions + numInstructions,
|
||||
&parse.FullToken.FullInstruction,
|
||||
sizeof(instructions[0]));
|
||||
numInstructions++;
|
||||
break;
|
||||
default:
|
||||
assert( 0 );
|
||||
}
|
||||
}
|
||||
tgsi_parse_free (&parse);
|
||||
|
||||
mach->Instructions = instructions;
|
||||
mach->NumInstructions = numInstructions;
|
||||
mach->Declarations = declarations;
|
||||
mach->NumDeclarations = numDeclarations;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
tgsi_exec_machine_run2(
|
||||
struct tgsi_exec_machine *mach,
|
||||
struct tgsi_exec_labels *labels )
|
||||
struct tgsi_exec_machine *mach )
|
||||
{
|
||||
#if 0 && MESA
|
||||
GET_CURRENT_CONTEXT(ctx);
|
||||
|
|
@ -2255,9 +2277,11 @@ tgsi_exec_machine_run2(
|
|||
GLuint k;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
if (!mach->Instructions) {
|
||||
expand_program(mach);
|
||||
}
|
||||
#endif
|
||||
|
||||
mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
|
||||
mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
|
||||
|
|
@ -2305,8 +2329,10 @@ tgsi_exec_machine_run2(
|
|||
exec_instruction( mach, mach->Instructions + pc, &pc );
|
||||
}
|
||||
|
||||
#if 0
|
||||
free(mach->Declarations);
|
||||
free(mach->Instructions);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -154,6 +154,8 @@ struct tgsi_exec_machine
|
|||
|
||||
struct tgsi_full_declaration *Declarations;
|
||||
uint NumDeclarations;
|
||||
|
||||
struct tgsi_exec_labels Labels;
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -166,8 +168,7 @@ tgsi_exec_machine_init(
|
|||
|
||||
void
|
||||
tgsi_exec_prepare(
|
||||
struct tgsi_exec_machine *mach,
|
||||
struct tgsi_exec_labels *labels );
|
||||
struct tgsi_exec_machine *mach );
|
||||
|
||||
void
|
||||
tgsi_exec_machine_run(
|
||||
|
|
@ -175,8 +176,7 @@ tgsi_exec_machine_run(
|
|||
|
||||
void
|
||||
tgsi_exec_machine_run2(
|
||||
struct tgsi_exec_machine *mach,
|
||||
struct tgsi_exec_labels *labels );
|
||||
struct tgsi_exec_machine *mach );
|
||||
|
||||
#if defined __cplusplus
|
||||
} // extern "C"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue