i965/fs: Use LD messages for pre-gen7 varying-index uniform loads

This comes at a minor performance cost at the moment (-3.2% +/- 0.2%, n=14 on
my GM45 forced to load all uniforms through the varying-index path), but we
get a whole vec4 at a time to reuse in the next commit.

v2: Fix comment about channels in the other message.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
NOTE: This is a candidate for the 9.1 branch.
Eric Anholt 2013-03-18 10:16:42 -07:00
parent ce316f62ef
commit 70b27e0e4b
4 changed files with 85 additions and 68 deletions

src/mesa/drivers/dri/i965/brw_fs.cpp

@@ -238,57 +238,53 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
    exec_list instructions;
    fs_inst *inst;

-   if (intel->gen >= 7) {
-      /* We have our constant surface use a pitch of 4 bytes, so our index can
-       * be any component of a vector, and then we load 4 contiguous
-       * components starting from that.
-       *
-       * We break down the const_offset to a portion added to the variable
-       * offset and a portion done using reg_offset, which means that if you
-       * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
-       * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
-       * CSE can later notice that those loads are all the same and eliminate
-       * the redundant ones.
+   /* We have our constant surface use a pitch of 4 bytes, so our index can
+    * be any component of a vector, and then we load 4 contiguous
+    * components starting from that.
+    *
+    * We break down the const_offset to a portion added to the variable
+    * offset and a portion done using reg_offset, which means that if you
+    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
+    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
+    * CSE can later notice that those loads are all the same and eliminate
+    * the redundant ones.
+    */
+   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
+   instructions.push_tail(ADD(vec4_offset,
+                              varying_offset, const_offset & ~3));
+
+   int scale = 1;
+   if (intel->gen == 4 && dispatch_width == 8) {
+      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
+       * u, v, r) as parameters, or we can just use the SIMD16 message
+       * consisting of (header, u).  We choose the second, at the cost of a
+       * longer return length.
        */
-      fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
-      instructions.push_tail(ADD(vec4_offset,
-                                 varying_offset, const_offset & ~3));
-
-      fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
-      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
-                                  vec4_result, surf_index, vec4_offset);
-      inst->regs_written = 4;
-      instructions.push_tail(inst);
-
-      vec4_result.reg_offset += const_offset & 3;
-      instructions.push_tail(MOV(dst, vec4_result));
-   } else {
-      fs_reg offset = fs_reg(this, glsl_type::uint_type);
-      instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
-
-      int base_mrf = 13;
-      bool header_present = true;
-
-      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
-      mrf.type = BRW_REGISTER_TYPE_D;
-
-      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
-       * dword-aligned byte offset.
-       */
-      if (intel->gen == 6) {
-         instructions.push_tail(MOV(mrf, offset));
-      } else {
-         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
-      }
-
-      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
-                                  dst, surf_index);
-      inst->header_present = header_present;
-      inst->base_mrf = base_mrf;
-      inst->mlen = header_present + dispatch_width / 8;
-      instructions.push_tail(inst);
+      scale = 2;
    }

+   enum opcode op;
+   if (intel->gen >= 7)
+      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+   else
+      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
+   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
+   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
+   inst->regs_written = 4 * scale;
+   instructions.push_tail(inst);
+
+   if (intel->gen < 7) {
+      inst->base_mrf = 13;
+      inst->header_present = true;
+      if (intel->gen == 4)
+         inst->mlen = 3;
+      else
+         inst->mlen = 1 + dispatch_width / 8;
+   }
+
+   vec4_result.reg_offset += (const_offset & 3) * scale;
+   instructions.push_tail(MOV(dst, vec4_result));
+
    return instructions;
 }
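The hunk above folds the vec4-aligned part of const_offset into the message's offset source and applies the remaining component index as a reg_offset into the returned vec4. As a rough standalone sketch of that arithmetic (illustrative values, not part of the commit):

   #include <stdio.h>

   /* Mirror of the offset split above: whole vec4s go into the message
    * offset, the component within the vec4 becomes a reg_offset.
    */
   static void split_offset(int varying_offset, int const_offset,
                            int *msg_offset, int *reg_offset)
   {
      *msg_offset = varying_offset + (const_offset & ~3);
      *reg_offset = const_offset & 3;
   }

   int main(void)
   {
      /* For "uniform vec4 a[20]; gl_FragColor = a[i];" all four component
       * loads share one message offset, so CSE can drop three of them.
       */
      for (int chan = 0; chan < 4; chan++) {
         int msg_offset, reg_offset;
         split_offset(8 /* i * 4 for i == 2 */, chan, &msg_offset, &reg_offset);
         printf("chan %d: msg_offset %d, reg_offset %d\n",
                chan, msg_offset, reg_offset);
      }
      return 0;
   }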
@@ -754,7 +750,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_UNSPILL:
       return 1;
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
-      return inst->header_present;
+      return inst->mlen;
    case FS_OPCODE_SPILL:
       return 2;
    default:

src/mesa/drivers/dri/i965/brw_fs.h

@@ -541,7 +541,8 @@ private:
                                             struct brw_reg surf_index,
                                             struct brw_reg offset);
    void generate_varying_pull_constant_load(fs_inst *inst, struct brw_reg dst,
-                                            struct brw_reg index);
+                                            struct brw_reg index,
+                                            struct brw_reg offset);
    void generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,

src/mesa/drivers/dri/i965/brw_fs_cse.cpp

@@ -69,6 +69,7 @@ is_expression(const fs_inst *const inst)
    case BRW_OPCODE_LRP:
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
    case FS_OPCODE_CINTERP:
    case FS_OPCODE_LINTERP:
       return true;
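For context, is_expression() gates which opcodes the FS CSE pass may treat as pure values; listing the pre-gen7 opcode here is what lets the four identical vec4 loads generated above collapse into one. A toy sketch of that deduplication idea (not Mesa's actual pass; the struct and values are illustrative):

   #include <stdbool.h>
   #include <stdio.h>
   #include <string.h>

   struct inst { const char *op; int src0, src1; int dst; };

   /* Two instructions compute the same value if their opcode and
    * sources match and the opcode has no side effects.
    */
   static bool same_expr(const struct inst *a, const struct inst *b)
   {
      return strcmp(a->op, b->op) == 0 &&
             a->src0 == b->src0 && a->src1 == b->src1;
   }

   int main(void)
   {
      /* Four vec4 loads from the same surface/offset, as generated for
       * "gl_FragColor = a[i]" before CSE runs.
       */
      struct inst insts[4] = {
         { "VARYING_PULL_CONSTANT_LOAD", /*surf*/ 1, /*offset*/ 8, /*dst*/ 10 },
         { "VARYING_PULL_CONSTANT_LOAD", 1, 8, 11 },
         { "VARYING_PULL_CONSTANT_LOAD", 1, 8, 12 },
         { "VARYING_PULL_CONSTANT_LOAD", 1, 8, 13 },
      };

      for (int i = 1; i < 4; i++) {
         if (same_expr(&insts[0], &insts[i]))
            printf("inst %d -> MOV dst%d, dst%d (load eliminated)\n",
                   i, insts[i].dst, insts[0].dst);
      }
      return 0;
   }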

src/mesa/drivers/dri/i965/brw_fs_emit.cpp

@@ -674,47 +674,66 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 void
 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                   struct brw_reg dst,
-                                                  struct brw_reg index)
+                                                  struct brw_reg index,
+                                                  struct brw_reg offset)
 {
    assert(intel->gen < 7); /* Should use the gen7 variant. */
-   assert(inst->header_present);
+   assert(inst->mlen);

    assert(index.file == BRW_IMMEDIATE_VALUE &&
           index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.dw1.ud;

-   uint32_t msg_type, msg_control, rlen;
-   if (intel->gen >= 6)
-      msg_type = GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
-   else if (intel->gen == 5 || intel->is_g4x)
-      msg_type = G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
-   else
-      msg_type = BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ;
-
+   uint32_t simd_mode, rlen, msg_type;
    if (dispatch_width == 16) {
-      msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS;
-      rlen = 2;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      rlen = 8;
    } else {
-      msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS;
-      rlen = 1;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      rlen = 4;
    }

+   if (intel->gen >= 5)
+      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+   else {
+      /* We always use the SIMD16 message so that we only have to load U, and
+       * not V or R.
+       */
+      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+      assert(inst->mlen == 3);
+      assert(inst->regs_written == 8);
+      rlen = 8;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
+
+   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
+                                      BRW_REGISTER_TYPE_D);
+   brw_MOV(p, offset_mrf, offset);
+
    struct brw_reg header = brw_vec8_grf(0, 0);
    gen6_resolve_implied_move(p, &header, inst->base_mrf);

    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   send->header.compression_control = BRW_COMPRESSION_NONE;
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (intel->gen < 6)
       send->header.destreg__conditionalmod = inst->base_mrf;
-   brw_set_dp_read_message(p, send,
+
+   /* Our surface is set up as floats, regardless of what actual data is
+    * stored in it.
+    */
+   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+   brw_set_sampler_message(p, send,
                            surf_index,
-                           msg_control,
+                           0, /* sampler (unused) */
                            msg_type,
-                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           rlen,
                            inst->mlen,
                            inst->header_present,
-                           rlen);
+                           simd_mode,
+                           return_format);
 }

 void
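Summarizing the payload sizes the generator now depends on: a sketch under the assumptions visible in this diff (one header MRF plus one GRF of dword offsets per 8 channels in; one GRF per 8 channels per vec4 component back; gen4 SIMD8 borrows the SIMD16 message, hence mlen 3 and rlen 8):

   #include <stdio.h>

   struct msg_lens { int mlen, rlen; };

   /* Message/response lengths (in registers) for the pre-gen7
    * varying-index pull-constant LD, per the logic in this commit.
    */
   static struct msg_lens varying_pull_lens(int gen, int dispatch_width)
   {
      struct msg_lens l;
      if (gen == 4 && dispatch_width == 8) {
         /* SIMD8 on gen4 still sends the SIMD16 LD message: header plus
          * two GRFs of U coordinates in, a SIMD16 vec4 (8 GRFs) back.
          */
         l.mlen = 3;
         l.rlen = 8;
      } else {
         l.mlen = 1 + dispatch_width / 8;
         l.rlen = 4 * (dispatch_width / 8);
      }
      return l;
   }

   int main(void)
   {
      int gens[] = { 4, 4, 5, 6 };
      int widths[] = { 8, 16, 8, 16 };
      for (int i = 0; i < 4; i++) {
         struct msg_lens l = varying_pull_lens(gens[i], widths[i]);
         printf("gen%d SIMD%d: mlen %d, rlen %d\n",
                gens[i], widths[i], l.mlen, l.rlen);
      }
      return 0;
   }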
@@ -1305,7 +1324,7 @@ fs_generator::generate_code(exec_list *instructions)
          break;

       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
-         generate_varying_pull_constant_load(inst, dst, src[0]);
+         generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
          break;

       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: