mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 02:20:11 +01:00
intel: Only set VectorMaskEnable when needed
For cases with lots of very small primitives, this may improve performance because we're not executing those dead channels all the time. Shader-db reports no instruction or cycle-count changes. However, by hacking up the driver to report when this optimization triggers, it appears to affect about 10% of shader-db. v2 (Kenneth Graunke): Always enable VMask prior to XeHP for now, because using VMask on those platforms allows us to perform the eliminate_find_live_channel() optimization. However, XeHP doesn't seem to have packed fragment shader dispatch, so we lose that optimization regardless, and there's no reason to keep VMask enabled there unless the shader actually needs it. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/1054>
This commit is contained in:
parent
7c1498daba
commit
dfedeccc13
6 changed files with 29 additions and 9 deletions
|
|
@ -6441,7 +6441,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
|
||||||
* incorrect for subspans where some of the pixels are unlit. We believe
|
* incorrect for subspans where some of the pixels are unlit. We believe
|
||||||
* the bit just didn't take effect in previous generations.
|
* the bit just didn't take effect in previous generations.
|
||||||
*/
|
*/
|
||||||
ps.VectorMaskEnable = GFX_VER >= 8;
|
ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
|
||||||
|
|
||||||
ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
|
ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
|
||||||
ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
|
ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
|
||||||
|
|
|
||||||
|
|
@ -4654,7 +4654,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
|
||||||
uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
|
uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
|
||||||
|
|
||||||
iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
|
iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
|
||||||
ps.VectorMaskEnable = true;
|
ps.VectorMaskEnable = wm_prog_data->uses_vmask;
|
||||||
ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
|
ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
|
||||||
ps.FloatingPointMode = prog_data->use_alt_mode;
|
ps.FloatingPointMode = prog_data->use_alt_mode;
|
||||||
ps.MaximumNumberofThreadsPerPSD =
|
ps.MaximumNumberofThreadsPerPSD =
|
||||||
|
|
|
||||||
|
|
@ -887,6 +887,7 @@ struct brw_wm_prog_data {
|
||||||
bool uses_src_w;
|
bool uses_src_w;
|
||||||
bool uses_depth_w_coefficients;
|
bool uses_depth_w_coefficients;
|
||||||
bool uses_sample_mask;
|
bool uses_sample_mask;
|
||||||
|
bool uses_vmask;
|
||||||
bool has_render_target_reads;
|
bool has_render_target_reads;
|
||||||
bool has_side_effects;
|
bool has_side_effects;
|
||||||
bool pulls_bary;
|
bool pulls_bary;
|
||||||
|
|
@ -1967,7 +1968,9 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
|
||||||
*/
|
*/
|
||||||
const struct brw_wm_prog_data *wm_prog_data =
|
const struct brw_wm_prog_data *wm_prog_data =
|
||||||
(const struct brw_wm_prog_data *)prog_data;
|
(const struct brw_wm_prog_data *)prog_data;
|
||||||
return devinfo->verx10 < 125 && !wm_prog_data->persample_dispatch;
|
return devinfo->verx10 < 125 &&
|
||||||
|
!wm_prog_data->persample_dispatch &&
|
||||||
|
wm_prog_data->uses_vmask;
|
||||||
}
|
}
|
||||||
case MESA_SHADER_COMPUTE:
|
case MESA_SHADER_COMPUTE:
|
||||||
/* Compute shaders will be spawned with either a fully enabled dispatch
|
/* Compute shaders will be spawned with either a fully enabled dispatch
|
||||||
|
|
|
||||||
|
|
@ -9824,6 +9824,14 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
|
||||||
(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
|
(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
|
||||||
!prog_data->computed_stencil;
|
!prog_data->computed_stencil;
|
||||||
|
|
||||||
|
/* We choose to always enable VMask prior to XeHP, as disabling it would cause
|
||||||
|
* us to lose out on the eliminate_find_live_channel() optimization.
|
||||||
|
*/
|
||||||
|
prog_data->uses_vmask = devinfo->verx10 < 125 ||
|
||||||
|
shader->info.fs.needs_quad_helper_invocations ||
|
||||||
|
shader->info.fs.needs_all_helper_invocations ||
|
||||||
|
prog_data->per_coarse_pixel_dispatch;
|
||||||
|
|
||||||
prog_data->uses_src_w =
|
prog_data->uses_src_w =
|
||||||
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
|
||||||
prog_data->uses_src_depth =
|
prog_data->uses_src_depth =
|
||||||
|
|
@ -10569,13 +10577,15 @@ static UNUSED void
|
||||||
brw_fs_test_dispatch_packing(const fs_builder &bld)
|
brw_fs_test_dispatch_packing(const fs_builder &bld)
|
||||||
{
|
{
|
||||||
const gl_shader_stage stage = bld.shader->stage;
|
const gl_shader_stage stage = bld.shader->stage;
|
||||||
|
const bool uses_vmask =
|
||||||
|
stage == MESA_SHADER_FRAGMENT &&
|
||||||
|
brw_wm_prog_data(bld.shader->stage_prog_data)->uses_vmask;
|
||||||
|
|
||||||
if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
|
if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
|
||||||
bld.shader->stage_prog_data)) {
|
bld.shader->stage_prog_data)) {
|
||||||
const fs_builder ubld = bld.exec_all().group(1, 0);
|
const fs_builder ubld = bld.exec_all().group(1, 0);
|
||||||
const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
|
const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
|
||||||
const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
|
const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
|
||||||
brw_dmask_reg());
|
|
||||||
|
|
||||||
ubld.ADD(tmp, mask, brw_imm_ud(1));
|
ubld.ADD(tmp, mask, brw_imm_ud(1));
|
||||||
ubld.AND(tmp, mask, tmp);
|
ubld.AND(tmp, mask, tmp);
|
||||||
|
|
|
||||||
|
|
@ -2411,21 +2411,27 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
|
||||||
|
const bool uses_vmask =
|
||||||
|
stage == MESA_SHADER_FRAGMENT &&
|
||||||
|
brw_wm_prog_data(this->prog_data)->uses_vmask;
|
||||||
const struct brw_reg mask =
|
const struct brw_reg mask =
|
||||||
brw_stage_has_packed_dispatch(devinfo, stage,
|
brw_stage_has_packed_dispatch(devinfo, stage,
|
||||||
prog_data) ? brw_imm_ud(~0u) :
|
prog_data) ? brw_imm_ud(~0u) :
|
||||||
stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
|
uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
|
||||||
brw_dmask_reg();
|
|
||||||
|
|
||||||
brw_find_live_channel(p, dst, mask, false);
|
brw_find_live_channel(p, dst, mask, false);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
|
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
|
||||||
|
const bool uses_vmask =
|
||||||
|
stage == MESA_SHADER_FRAGMENT &&
|
||||||
|
brw_wm_prog_data(this->prog_data)->uses_vmask;
|
||||||
|
|
||||||
/* ce0 doesn't consider the thread dispatch mask, so if we want
|
/* ce0 doesn't consider the thread dispatch mask, so if we want
|
||||||
* to find the true last enabled channel, we need to apply that too.
|
* to find the true last enabled channel, we need to apply that too.
|
||||||
*/
|
*/
|
||||||
const struct brw_reg mask =
|
const struct brw_reg mask =
|
||||||
stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : brw_dmask_reg();
|
uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
|
||||||
|
|
||||||
brw_find_live_channel(p, dst, mask, true);
|
brw_find_live_channel(p, dst, mask, true);
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
|
|
@ -2351,7 +2351,8 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
|
||||||
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
|
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
|
||||||
|
|
||||||
ps.SingleProgramFlow = false;
|
ps.SingleProgramFlow = false;
|
||||||
ps.VectorMaskEnable = GFX_VER >= 8;
|
ps.VectorMaskEnable = GFX_VER >= 8 &&
|
||||||
|
wm_prog_data->uses_vmask;
|
||||||
/* Wa_1606682166 */
|
/* Wa_1606682166 */
|
||||||
ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
|
ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
|
||||||
ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
|
ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue