i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.

This should fix the register allocation explosion on the GLES 3.0 test
on gen6.  It also gives us an instruction that will fit our CSE handling.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
NOTE: This is a candidate for the 9.1 branch.
This commit is contained in:
Eric Anholt 2013-02-15 19:26:48 -08:00
parent 49bdebad38
commit aebd3f46e3
3 changed files with 66 additions and 27 deletions

View file

@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants()
dst, index, offset);
pull->ir = inst->ir;
pull->annotation = inst->annotation;
pull->base_mrf = 14;
pull->mlen = 1;
inst->insert_before(pull);
@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
}
}
/**
 * Rewrites every generic FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD in the
 * instruction list into the form needed for the hardware generation being
 * compiled for.
 *
 * Keeping the load in its generic, expression-like shape up to this point
 * lets the CSE pass that runs earlier fold repeated loads from the same
 * offset and leaves the pre-register-allocation scheduler fully flexible.
 * Converting to native instructions here gives the
 * post-register-allocation scheduler the best information possible.
 *
 * On gen7 and later, a MOV of g0 into a fresh payload register and an
 * FS_OPCODE_SET_GLOBAL_OFFSET are inserted in front of the load, and the
 * load itself becomes FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 with the
 * payload as src[1].  On earlier generations the opcode is left alone and
 * the instruction is simply assigned its message register (MRF 14, length 1).
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (intel->gen < 7) {
         /* The scheduler was not told about this MRF before register
          * allocation.  Using it is still safe: the only other user is
          * register spill/unspill, which generates and consumes its MRF
          * within a single IR instruction.
          */
         inst->mlen = 1;
         inst->base_mrf = 14;
         continue;
      }

      /* The generic load carries its offset as an unsigned immediate in
       * src[1]; the GEN7 sequence is given that value divided by 16.
       */
      fs_reg const_offset_reg = inst->src[1];
      assert(const_offset_reg.file == IMM &&
             const_offset_reg.type == BRW_REGISTER_TYPE_UD);
      const_offset_reg.imm.u = const_offset_reg.imm.u / 16;

      fs_reg msg_payload = fs_reg(this, glsl_type::uint_type);
      struct brw_reg g0_ud = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);

      /* The second half of this vgrf does not need to be filled with g1 in
       * the 16-wide case, but force_uncompressed is deliberately not used:
       * with it, live variable analysis would not consider this a def.
       */
      fs_inst *copy_g0 = MOV(msg_payload, fs_reg(g0_ud));
      copy_g0->force_writemask_all = true;

      fs_inst *set_offset =
         new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
                              msg_payload, msg_payload, const_offset_reg);

      /* Both setup instructions inherit the load's IR pointer and
       * annotation, and are placed directly ahead of it in this order.
       */
      copy_g0->annotation = inst->annotation;
      copy_g0->ir = inst->ir;
      inst->insert_before(copy_g0);

      set_offset->annotation = inst->annotation;
      set_offset->ir = inst->ir;
      inst->insert_before(set_offset);

      /* Finally retarget the load itself at the prepared payload. */
      inst->src[1] = msg_payload;
      inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
   }
}
void
fs_visitor::dump_instruction(fs_inst *inst)
{
@ -2748,6 +2806,8 @@ fs_visitor::run()
schedule_instructions(false);
lower_uniform_pull_constant_loads();
assign_curb_setup();
assign_urb_setup();

View file

@ -334,6 +334,7 @@ public:
void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
void fail(const char *msg, ...);
void lower_uniform_pull_constant_loads();
void push_force_uncompressed();
void pop_force_uncompressed();

View file

@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
packed_consts.type = result.type;
if (intel->gen >= 7) {
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
fs_reg payload = fs_reg(this, glsl_type::uint_type);
struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UD);
fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
setup->force_writemask_all = true;
/* We don't need the second half of this vgrf to be filled with g1
* in the 16-wide case, but if we use force_uncompressed then live
* variable analysis won't consider this a def!
*/
emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
payload, const_offset_reg);
emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
surf_index, payload);
} else {
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
packed_consts,
surf_index,
const_offset_reg));
pull->base_mrf = 14;
pull->mlen = 1;
}
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
packed_consts, surf_index, const_offset_reg));
packed_consts.smear = const_offset->value.u[0] % 16 / 4;
for (int i = 0; i < ir->type->vector_elements; i++) {