r300: better register allocator for vertex shaders

This is a port of the fp pair regalloc. It is, however, much simpler:
unlike in the fp case, we don't have to care about texturing, we can use
any swizzle we want, and we don't have to track the inputs. Using the
register class machinery might be slight overkill right now, but the
infrastructure will hopefully come in handy if someone eventually
decides to implement vp pair scheduling.

Shader-db stats:

RV530:
total temps in shared programs: 18594 -> 17000 (-8.57%)
temps in affected programs: 5753 -> 4159 (-27.71%)
helped: 665
HURT: 0

RV370:
total temps in shared programs: 13555 -> 12181 (-10.14%)
temps in affected programs: 5116 -> 3742 (-26.86%)
helped: 633
HURT: 0

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5972

Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Reviewed-by: Filip Gawin <filip@gawin.net>
Tested-by: Filip Gawin <filip@gawin.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19618>
This commit is contained in:
Pavel Ondračka 2022-10-25 21:41:48 +02:00 committed by Marge Bot
parent 0889c44f18
commit 54f6e72b27
4 changed files with 52 additions and 65 deletions

View file

@ -34,6 +34,8 @@
#include "radeon_swizzle.h"
#include "radeon_emulate_branches.h"
#include "radeon_remove_constants.h"
#include "radeon_regalloc.h"
#include "radeon_list.h"
#include "util/compiler.h"
@ -610,82 +612,64 @@ static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, b
static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
{
struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
struct rc_instruction *inst;
struct rc_instruction *end_loop = NULL;
unsigned int num_orig_temps = 0;
bool hwtemps[RC_REGISTER_MAX_INDEX];
struct temporary_allocation * ta;
unsigned int i;
memset(hwtemps, 0, sizeof(hwtemps));
unsigned int node_count, node_index;
struct ra_class ** node_classes;
struct rc_list * var_ptr;
struct rc_list * variables;
struct ra_graph * graph;
const struct rc_regalloc_state *ra_state = c->regalloc_state;
rc_recompute_ips(c);
/* Pass 1: Count original temporaries. */
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
/* Get list of program variables */
variables = rc_get_variables(c);
node_count = rc_list_count(variables);
node_classes = memory_pool_malloc(&c->Pool,
node_count * sizeof(struct ra_class *));
for (i = 0; i < opcode->NumSrcRegs; ++i) {
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
}
}
if (opcode->HasDstReg) {
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
if (inst->U.I.DstReg.Index >= num_orig_temps)
num_orig_temps = inst->U.I.DstReg.Index + 1;
}
for (var_ptr = variables, node_index = 0; var_ptr;
var_ptr = var_ptr->Next, node_index++) {
unsigned int class_index = 0;
int index;
/* Compute the live intervals */
rc_variable_compute_live_intervals(var_ptr->Item);
unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
if (index > -1) {
class_index = c->regalloc_state->class_list[index].ID;
} else {
rc_error(c,
"Could not find class for index=%u mask=%u\n",
((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
}
node_classes[node_index] = ra_state->classes[class_index];
}
ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
sizeof(struct temporary_allocation) * num_orig_temps);
memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
graph = ra_alloc_interference_graph(ra_state->regs, node_count);
/* Pass 2: Determine original temporary lifetimes */
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
/* Instructions inside of loops need to use the ENDLOOP
* instruction as their LastRead. */
if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP)
end_loop = rc_match_bgnloop(inst);
if (inst == end_loop) {
end_loop = NULL;
continue;
}
for (i = 0; i < opcode->NumSrcRegs; ++i) {
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
}
}
for (node_index = 0; node_index < node_count; node_index++) {
ra_set_node_class(graph, node_index, node_classes[node_index]);
}
/* Pass 3: Register allocation */
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
rc_build_interference_graph(graph, variables);
for (i = 0; i < opcode->NumSrcRegs; ++i) {
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
unsigned int orig = inst->U.I.SrcReg[i].Index;
inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
if (ta[orig].Allocated && inst == ta[orig].LastRead)
hwtemps[ta[orig].HwTemp] = false;
}
}
if (opcode->HasDstReg) {
if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
unsigned int orig = inst->U.I.DstReg.Index;
inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
}
}
if (!ra_allocate(graph)) {
rc_error(c, "Ran out of hardware temporaries\n");
return;
}
/* Rewrite the registers */
for (var_ptr = variables, node_index = 0; var_ptr;
var_ptr = var_ptr->Next, node_index++) {
int reg = ra_get_node_reg(graph, node_index);
unsigned int writemask = reg_get_writemask(reg);
unsigned int index = reg_get_index(reg);
struct rc_variable * var = var_ptr->Item;
rc_variable_change_dst(var, index, writemask);
}
ralloc_free(graph);
}
/**

View file

@ -491,6 +491,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
/* Register allocator state */
rc_init_regalloc_state(&r300->fs_regalloc_state, RC_FRAGMENT_PROGRAM);
rc_init_regalloc_state(&r300->vs_regalloc_state, RC_VERTEX_PROGRAM);
/* Print driver info. */
#ifdef DEBUG

View file

@ -637,6 +637,8 @@ struct r300_context {
/* Compiler state. */
struct rc_regalloc_state fs_regalloc_state; /* Register allocator info for
* fragment shaders. */
struct rc_regalloc_state vs_regalloc_state; /* Register allocator info for
* vertex shaders. */
};
#define foreach_atom(r300, atom) \

View file

@ -190,7 +190,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
/* Setup the compiler */
memset(&compiler, 0, sizeof(compiler));
rc_init(&compiler.Base, NULL);
rc_init(&compiler.Base, &r300->vs_regalloc_state);
DBG_ON(r300, DBG_VP) ? compiler.Base.Debug |= RC_DBG_LOG : 0;
compiler.code = &vs->code;