mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-03 20:10:17 +01:00
i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.
This should fix the register allocation explosion on the GLES 3.0 test on gen6.
It also gives us an instruction that will fit our CSE handling.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
NOTE: This is a candidate for the 9.1 branch.
This commit is contained in:
parent
49bdebad38
commit
aebd3f46e3
3 changed files with 66 additions and 27 deletions
|
|
@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants()
|
|||
dst, index, offset);
|
||||
pull->ir = inst->ir;
|
||||
pull->annotation = inst->annotation;
|
||||
pull->base_mrf = 14;
|
||||
pull->mlen = 1;
|
||||
|
||||
inst->insert_before(pull);
|
||||
|
||||
|
|
@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Turns the generic expression-style uniform pull constant load instruction
|
||||
* into a hardware-specific series of instructions for loading a pull
|
||||
* constant.
|
||||
*
|
||||
* The expression style allows the CSE pass before this to optimize out
|
||||
* repeated loads from the same offset, and gives the pre-register-allocation
|
||||
* scheduling full flexibility, while the conversion to native instructions
|
||||
* allows the post-register-allocation scheduler the best information
|
||||
* possible.
|
||||
*/
|
||||
void
|
||||
fs_visitor::lower_uniform_pull_constant_loads()
|
||||
{
|
||||
foreach_list(node, &this->instructions) {
|
||||
fs_inst *inst = (fs_inst *)node;
|
||||
|
||||
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
|
||||
continue;
|
||||
|
||||
if (intel->gen >= 7) {
|
||||
fs_reg const_offset_reg = inst->src[1];
|
||||
assert(const_offset_reg.file == IMM &&
|
||||
const_offset_reg.type == BRW_REGISTER_TYPE_UD);
|
||||
const_offset_reg.imm.u /= 16;
|
||||
fs_reg payload = fs_reg(this, glsl_type::uint_type);
|
||||
struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
|
||||
BRW_REGISTER_TYPE_UD);
|
||||
|
||||
fs_inst *setup1 = MOV(payload, fs_reg(g0));
|
||||
setup1->force_writemask_all = true;
|
||||
/* We don't need the second half of this vgrf to be filled with g1
|
||||
* in the 16-wide case, but if we use force_uncompressed then live
|
||||
* variable analysis won't consider this a def!
|
||||
*/
|
||||
|
||||
fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
|
||||
payload, payload,
|
||||
const_offset_reg);
|
||||
|
||||
setup1->ir = inst->ir;
|
||||
setup1->annotation = inst->annotation;
|
||||
inst->insert_before(setup1);
|
||||
setup2->ir = inst->ir;
|
||||
setup2->annotation = inst->annotation;
|
||||
inst->insert_before(setup2);
|
||||
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
|
||||
inst->src[1] = payload;
|
||||
} else {
|
||||
/* Before register allocation, we didn't tell the scheduler about the
|
||||
* MRF we use. We know it's safe to use this MRF because nothing
|
||||
* else does except for register spill/unspill, which generates and
|
||||
* uses its MRF within a single IR instruction.
|
||||
*/
|
||||
inst->base_mrf = 14;
|
||||
inst->mlen = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::dump_instruction(fs_inst *inst)
|
||||
{
|
||||
|
|
@ -2748,6 +2806,8 @@ fs_visitor::run()
|
|||
|
||||
schedule_instructions(false);
|
||||
|
||||
lower_uniform_pull_constant_loads();
|
||||
|
||||
assign_curb_setup();
|
||||
assign_urb_setup();
|
||||
|
||||
|
|
|
|||
|
|
@ -334,6 +334,7 @@ public:
|
|||
void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
|
||||
void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
|
||||
void fail(const char *msg, ...);
|
||||
void lower_uniform_pull_constant_loads();
|
||||
|
||||
void push_force_uncompressed();
|
||||
void pop_force_uncompressed();
|
||||
|
|
|
|||
|
|
@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
|
|||
fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
|
||||
packed_consts.type = result.type;
|
||||
|
||||
if (intel->gen >= 7) {
|
||||
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
|
||||
fs_reg payload = fs_reg(this, glsl_type::uint_type);
|
||||
struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
|
||||
BRW_REGISTER_TYPE_UD);
|
||||
fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
|
||||
setup->force_writemask_all = true;
|
||||
/* We don't need the second half of this vgrf to be filled with g1
|
||||
* in the 16-wide case, but if we use force_uncompressed then live
|
||||
* variable analysis won't consider this a def!
|
||||
*/
|
||||
|
||||
emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
|
||||
payload, const_offset_reg);
|
||||
emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
|
||||
surf_index, payload);
|
||||
} else {
|
||||
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
|
||||
fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
|
||||
packed_consts,
|
||||
surf_index,
|
||||
const_offset_reg));
|
||||
pull->base_mrf = 14;
|
||||
pull->mlen = 1;
|
||||
}
|
||||
fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
|
||||
emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
|
||||
packed_consts, surf_index, const_offset_reg));
|
||||
|
||||
packed_consts.smear = const_offset->value.u[0] % 16 / 4;
|
||||
for (int i = 0; i < ir->type->vector_elements; i++) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue