mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 15:30:14 +01:00
i965/fs: Combine pixel center calculation into one inst.
The X and Y values come interleaved in g1 (.4-.11 inclusive), so we can calculate them together with a single add(32) instruction on some platforms like Broadwell and newer or in SIMD8 elsewhere. Note that I also moved the PIXEL_X/PIXEL_Y virtual opcodes from before LINTERP to after it. That's because the writes_accumulator_implicitly() function in backend_instruction tests for <= LINTERP for determining whether the instruction indeed writes the accumulator implicitly. The old FS_OPCODE_PIXEL_X/Y emitted ADD instructions, which did, but the new opcodes just emit MOVs, which don't. It doesn't matter, since we don't use these opcodes on Gen4/5 anymore, but in the case that we do... On Broadwell: total instructions in shared programs: 7192355 -> 7186224 (-0.09%) instructions in affected programs: 1190700 -> 1184569 (-0.51%) helped: 6131 On Haswell: total instructions in shared programs: 6155979 -> 6152800 (-0.05%) instructions in affected programs: 652362 -> 649183 (-0.49%) helped: 3179 Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
This commit is contained in:
parent
5af0604d52
commit
529064f6a8
3 changed files with 62 additions and 19 deletions
|
|
@ -925,6 +925,8 @@ enum opcode {
|
|||
FS_OPCODE_DDY_FINE,
|
||||
FS_OPCODE_CINTERP,
|
||||
FS_OPCODE_LINTERP,
|
||||
FS_OPCODE_PIXEL_X,
|
||||
FS_OPCODE_PIXEL_Y,
|
||||
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
|
||||
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
|
||||
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
|
||||
|
|
|
|||
|
|
@ -1940,6 +1940,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
|
|||
case FS_OPCODE_LINTERP:
|
||||
generate_linterp(inst, dst, src);
|
||||
break;
|
||||
case FS_OPCODE_PIXEL_X:
|
||||
assert(src[0].type == BRW_REGISTER_TYPE_UW);
|
||||
src[0].subnr = 0 * type_sz(src[0].type);
|
||||
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
|
||||
break;
|
||||
case FS_OPCODE_PIXEL_Y:
|
||||
assert(src[0].type == BRW_REGISTER_TYPE_UW);
|
||||
src[0].subnr = 4 * type_sz(src[0].type);
|
||||
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
|
||||
break;
|
||||
case SHADER_OPCODE_TEX:
|
||||
case FS_OPCODE_TXB:
|
||||
case SHADER_OPCODE_TXD:
|
||||
|
|
|
|||
|
|
@ -3478,27 +3478,58 @@ fs_visitor::emit_interpolation_setup_gen6()
|
|||
{
|
||||
struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
|
||||
|
||||
/* If the pixel centers end up used, the setup is the same as for gen4. */
|
||||
this->current_annotation = "compute pixel centers";
|
||||
fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
|
||||
fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
|
||||
int_pixel_x.type = BRW_REGISTER_TYPE_UW;
|
||||
int_pixel_y.type = BRW_REGISTER_TYPE_UW;
|
||||
emit(ADD(int_pixel_x,
|
||||
fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
|
||||
fs_reg(brw_imm_v(0x10101010))));
|
||||
emit(ADD(int_pixel_y,
|
||||
fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
|
||||
fs_reg(brw_imm_v(0x11001100))));
|
||||
if (brw->gen >= 8 || dispatch_width == 8) {
|
||||
/* The "Register Region Restrictions" page says for BDW (and newer,
|
||||
* presumably):
|
||||
*
|
||||
* "When destination spans two registers, the source may be one or
|
||||
* two registers. The destination elements must be evenly split
|
||||
* between the two registers."
|
||||
*
|
||||
* Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
|
||||
* compute our pixel centers.
|
||||
*/
|
||||
fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
|
||||
BRW_REGISTER_TYPE_UW, dispatch_width * 2);
|
||||
emit(ADD(int_pixel_xy,
|
||||
fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
|
||||
fs_reg(brw_imm_v(0x11001010))))
|
||||
->force_writemask_all = true;
|
||||
|
||||
/* As of gen6, we can no longer mix float and int sources. We have
|
||||
* to turn the integer pixel centers into floats for their actual
|
||||
* use.
|
||||
*/
|
||||
this->pixel_x = vgrf(glsl_type::float_type);
|
||||
this->pixel_y = vgrf(glsl_type::float_type);
|
||||
emit(MOV(this->pixel_x, int_pixel_x));
|
||||
emit(MOV(this->pixel_y, int_pixel_y));
|
||||
this->pixel_x = vgrf(glsl_type::float_type);
|
||||
this->pixel_y = vgrf(glsl_type::float_type);
|
||||
emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
|
||||
emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
|
||||
} else {
|
||||
/* The "Register Region Restrictions" page says for SNB, IVB, HSW:
|
||||
*
|
||||
* "When destination spans two registers, the source MUST span two
|
||||
* registers."
|
||||
*
|
||||
* Since the GRF source of the ADD will only read a single register, we
|
||||
* must do two separate ADDs in SIMD16.
|
||||
*/
|
||||
fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
|
||||
fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
|
||||
int_pixel_x.type = BRW_REGISTER_TYPE_UW;
|
||||
int_pixel_y.type = BRW_REGISTER_TYPE_UW;
|
||||
emit(ADD(int_pixel_x,
|
||||
fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
|
||||
fs_reg(brw_imm_v(0x10101010))));
|
||||
emit(ADD(int_pixel_y,
|
||||
fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
|
||||
fs_reg(brw_imm_v(0x11001100))));
|
||||
|
||||
/* As of gen6, we can no longer mix float and int sources. We have
|
||||
* to turn the integer pixel centers into floats for their actual
|
||||
* use.
|
||||
*/
|
||||
this->pixel_x = vgrf(glsl_type::float_type);
|
||||
this->pixel_y = vgrf(glsl_type::float_type);
|
||||
emit(MOV(this->pixel_x, int_pixel_x));
|
||||
emit(MOV(this->pixel_y, int_pixel_y));
|
||||
}
|
||||
|
||||
this->current_annotation = "compute pos.w";
|
||||
this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue