i965/fs: Try a different pre-scheduling heuristic if the first spills.
Since LIFO fails on some shaders in one particular way, and non-LIFO systematically fails in another way on different kinds of shaders, try them both and pick whichever one successfully register allocates first. Slightly prefer non-LIFO in case we produce extra dependencies in register allocation, since it should start out with fewer stalls than LIFO.

This is madness, but I haven't come up with another way to get Unigine Tropics to not spill while keeping other programs from spilling and retaining the non-Unigine performance wins from texture-grf.

total instructions in shared programs: 1626728 -> 1626288 (-0.03%)
instructions in affected programs: 1015 -> 575 (-43.35%)
GAINED: 50
LOST: 0

Improves Unigine Tropics performance by 14.5257% +/- 0.241838% (n=38)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
Cc: "10.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit e9daead784)
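The fallback order described above boils down to three steps: allocate under the non-LIFO pre-RA schedule, retry under the LIFO schedule, and only then allow spilling (a SIMD16 compile simply fails and the driver uses the SIMD8 program instead). The following is a minimal standalone sketch of that order, not the actual i965 code: only the enum values mirror the patch, while schedule_instructions(), assign_regs(), and allocate() here are illustrative stubs.

#include <cstdio>

/* Mirrors the enum added in brw_shader.h; everything else in this sketch is
 * a stand-in for the real scheduler/allocator, not i965 code. */
enum instruction_scheduler_mode {
   SCHEDULE_PRE_NON_LIFO,
   SCHEDULE_PRE_LIFO,
   SCHEDULE_POST,
};

static void schedule_instructions(instruction_scheduler_mode mode)
{
   /* Stub: pretend to reorder instructions under the given heuristic. */
   printf("scheduling with mode %d\n", mode);
}

static bool assign_regs(bool allow_spilling)
{
   /* Stub: pretend allocation only succeeds once spilling is permitted. */
   return allow_spilling;
}

/* The fallback order of the patch: non-LIFO first (fewer stalls), then LIFO,
 * and only then spilling; SIMD16 gives up rather than spill. */
static bool allocate(int dispatch_width)
{
   schedule_instructions(SCHEDULE_PRE_NON_LIFO);
   if (assign_regs(false))
      return true;

   schedule_instructions(SCHEDULE_PRE_LIFO);
   if (assign_regs(false))
      return true;

   if (dispatch_width == 16)
      return false;            /* caller falls back to the SIMD8 program */

   return assign_regs(true);   /* last resort: allocate with spilling */
}

int main()
{
   printf("SIMD8 allocation %s\n", allocate(8) ? "succeeded" : "failed");
   return 0;
}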
This commit is contained in:
parent 99c62ff2ea
commit 3e6f200250

5 changed files with 76 additions and 54 deletions
@@ -3286,15 +3286,28 @@ fs_visitor::run()
    assign_curb_setup();
    assign_urb_setup();
 
-   schedule_instructions(false);
+   schedule_instructions(SCHEDULE_PRE_NON_LIFO);
 
    if (0)
       assign_regs_trivial();
    else {
-      while (!assign_regs()) {
-         if (failed)
-            break;
-      }
+      if (!assign_regs(false)) {
+         /* Try a non-spilling register allocation again with a different
+          * scheduling heuristic.
+          */
+         schedule_instructions(SCHEDULE_PRE_LIFO);
+         if (!assign_regs(false)) {
+            if (dispatch_width == 16) {
+               fail("Failure to register allocate. Reduce number of "
+                    "live scalar values to avoid this.");
+            } else {
+               while (!assign_regs(true)) {
+                  if (failed)
+                     break;
+               }
+            }
+         }
+      }
    }
 
    assert(force_uncompressed_stack == 0);
@@ -3309,7 +3322,7 @@ fs_visitor::run()
    if (failed)
       return false;
 
-   schedule_instructions(true);
+   schedule_instructions(SCHEDULE_POST);
 
    if (dispatch_width == 8) {
       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
@@ -291,7 +291,7 @@ public:
    void assign_curb_setup();
    void calculate_urb_setup();
    void assign_urb_setup();
-   bool assign_regs();
+   bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
    void get_used_mrfs(bool *mrf_used);
    void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
@@ -322,7 +322,7 @@ public:
    bool remove_dead_constants();
    bool remove_duplicate_mrf_writes();
    bool virtual_grf_interferes(int a, int b);
-   void schedule_instructions(bool post_reg_alloc);
+   void schedule_instructions(instruction_scheduler_mode mode);
    void insert_gen4_send_dependency_workarounds();
    void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
    void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
@@ -417,7 +417,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 }
 
 bool
-fs_visitor::assign_regs()
+fs_visitor::assign_regs(bool allow_spilling)
 {
    /* Most of this allocation was written for a reg_width of 1
     * (dispatch_width == 8). In extending to 16-wide, the code was
@@ -496,14 +496,10 @@ fs_visitor::assign_regs()
       if (reg == -1) {
         fail("no register to spill:\n");
         dump_instructions();
-      } else if (dispatch_width == 16) {
-         fail("Failure to register allocate. Reduce number of live scalar "
-              "values to avoid this.");
-      } else {
-         spill_reg(reg);
+      } else if (allow_spilling) {
+         spill_reg(reg);
       }
 
       ralloc_free(g);
 
       return false;
@@ -391,14 +391,16 @@ schedule_node::set_latency_gen7(bool is_haswell)
 
 class instruction_scheduler {
 public:
-   instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
+   instruction_scheduler(backend_visitor *v, int grf_count,
+                         instruction_scheduler_mode mode)
    {
       this->bv = v;
       this->mem_ctx = ralloc_context(NULL);
       this->grf_count = grf_count;
       this->instructions.make_empty();
       this->instructions_to_schedule = 0;
-      this->post_reg_alloc = post_reg_alloc;
+      this->post_reg_alloc = (mode == SCHEDULE_POST);
+      this->mode = mode;
       this->time = 0;
       if (!post_reg_alloc) {
          this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
@@ -447,6 +449,8 @@ public:
    exec_list instructions;
    backend_visitor *bv;
 
+   instruction_scheduler_mode mode;
+
    /**
     * Number of instructions left to schedule that reference each vgrf.
    *
@@ -467,7 +471,8 @@ public:
 class fs_instruction_scheduler : public instruction_scheduler
 {
 public:
-   fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
+   fs_instruction_scheduler(fs_visitor *v, int grf_count,
+                            instruction_scheduler_mode mode);
    void calculate_deps();
    bool is_compressed(fs_inst *inst);
    schedule_node *choose_instruction_to_schedule();
@@ -481,8 +486,8 @@ public:
 
 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
                                                    int grf_count,
-                                                   bool post_reg_alloc)
-   : instruction_scheduler(v, grf_count, post_reg_alloc),
+                                                   instruction_scheduler_mode mode)
+   : instruction_scheduler(v, grf_count, mode),
     v(v)
 {
 }
@@ -569,7 +574,7 @@ public:
 
 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                        int grf_count)
-   : instruction_scheduler(v, grf_count, true),
+   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
     v(v)
 {
 }
@@ -1179,40 +1184,42 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
            continue;
         }
 
-      /* Prefer instructions that recently became available for scheduling.
-       * These are the things that are most likely to (eventually) make a
-       * variable dead and reduce register pressure. Typical register
-       * pressure estimates don't work for us because most of our pressure
-       * comes from texturing, where no single instruction to schedule will
-       * make a vec4 value dead.
-       */
-      if (n->cand_generation > chosen->cand_generation) {
-         chosen = n;
-         continue;
-      } else if (n->cand_generation < chosen->cand_generation) {
-         continue;
-      }
-
-      /* On MRF-using chips, prefer non-SEND instructions. If we don't do
-       * this, then because we prefer instructions that just became
-       * candidates, we'll end up in a pattern of scheduling a SEND, then
-       * the MRFs for the next SEND, then the next SEND, then the MRFs,
-       * etc., without ever consuming the results of a send.
-       */
-      if (v->brw->gen < 7) {
-         fs_inst *chosen_inst = (fs_inst *)chosen->inst;
-
-         /* We use regs_written > 1 as our test for the kind of send
-          * instruction to avoid -- only sends generate many regs, and a
-          * single-result send is probably actually reducing register
-          * pressure.
-          */
-         if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
-            chosen = n;
-            continue;
-         } else if (inst->regs_written > chosen_inst->regs_written) {
-            continue;
-         }
-      }
+      if (mode == SCHEDULE_PRE_LIFO) {
+         /* Prefer instructions that recently became available for
+          * scheduling. These are the things that are most likely to
+          * (eventually) make a variable dead and reduce register pressure.
+          * Typical register pressure estimates don't work for us because
+          * most of our pressure comes from texturing, where no single
+          * instruction to schedule will make a vec4 value dead.
+          */
+         if (n->cand_generation > chosen->cand_generation) {
+            chosen = n;
+            continue;
+         } else if (n->cand_generation < chosen->cand_generation) {
+            continue;
+         }
+
+         /* On MRF-using chips, prefer non-SEND instructions. If we don't
+          * do this, then because we prefer instructions that just became
+          * candidates, we'll end up in a pattern of scheduling a SEND,
+          * then the MRFs for the next SEND, then the next SEND, then the
+          * MRFs, etc., without ever consuming the results of a send.
+          */
+         if (v->brw->gen < 7) {
+            fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+            /* We use regs_written > 1 as our test for the kind of send
+             * instruction to avoid -- only sends generate many regs, and a
+             * single-result send is probably actually reducing register
+             * pressure.
+             */
+            if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+               chosen = n;
+               continue;
+            } else if (inst->regs_written > chosen_inst->regs_written) {
+               continue;
+            }
+         }
+      }
 
       /* For instructions pushed on the cands list at the same time, prefer
@@ -1407,18 +1414,18 @@ instruction_scheduler::run(exec_list *all_instructions)
 }
 
 void
-fs_visitor::schedule_instructions(bool post_reg_alloc)
+fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
 {
    int grf_count;
-   if (post_reg_alloc)
+   if (mode == SCHEDULE_POST)
       grf_count = grf_used;
    else
       grf_count = virtual_grf_count;
 
-   fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
+   fs_instruction_scheduler sched(this, grf_count, mode);
    sched.run(&instructions);
 
-   if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM) && mode == SCHEDULE_POST) {
      printf("fs%d estimated execution time: %d cycles\n",
             dispatch_width, sched.time);
   }
@@ -59,6 +59,12 @@ public:
    bool predicate_inverse;
 };
 
+enum instruction_scheduler_mode {
+   SCHEDULE_PRE_NON_LIFO,
+   SCHEDULE_PRE_LIFO,
+   SCHEDULE_POST,
+};
+
 class backend_visitor : public ir_visitor {
 public: