intel: Use Morton compute walk order

According to HSD 14016252163, if a compute shader uses the sample
operation, using Morton walk order and setting the thread group batch
size to 4 is expected to increase sampler cache hit rates by increasing
sample address locality within a subslice.

Rework:
 * Caio: "||" => "&&" for type checking in instr_uses_sampler()
 * Jordan: Use nir's foreach macros rather than
   nir_shader_lower_instructions()

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32430>
This commit is contained in:
Sagar Ghuge 2023-08-09 17:15:11 -07:00
parent 4bd958243d
commit d3f9139e49
6 changed files with 60 additions and 0 deletions

View file

@ -379,6 +379,7 @@ struct iris_cs_data {
enum intel_compute_walk_order walk_order;
bool uses_barrier;
bool uses_sampler;
bool first_param_is_builtin_subgroup_id;
};

View file

@ -150,6 +150,7 @@ iris_apply_brw_cs_prog_data(struct iris_compiled_shader *shader,
iris->generate_local_id = brw->generate_local_id;
iris->walk_order = brw->walk_order;
iris->uses_barrier = brw->uses_barrier;
iris->uses_sampler = brw->uses_sampler;
iris->prog_mask = brw->prog_mask;
iris->first_param_is_builtin_subgroup_id =

View file

@ -8941,6 +8941,16 @@ static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
body.ExecutionMask = dispatch.right_mask;
body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
body.InterfaceDescriptor = idd;
/* HSD 14016252163: Use of Morton walk order (and batching using a batch
* size of 4) is expected to increase sampler cache hit rates by
* increasing sample address locality within a subslice.
*/
#if GFX_VER >= 30
body.DispatchWalkOrder =
cs_data->uses_sampler ? MortonWalk : LinearWalk;
body.ThreadGroupBatchSize =
cs_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1;
#endif
struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {

View file

@ -98,6 +98,38 @@ run_cs(fs_visitor &s, bool allow_spilling)
return !s.failed;
}
/* nir_shader_instructions_pass callback: report whether a single
 * instruction is a texture operation that goes through the sampler.
 * Returns false for every non-texture instruction.
 */
static bool
instr_uses_sampler(nir_builder *b, nir_instr *instr, void *cb_data)
{
   /* Only texture instructions can involve the sampler. */
   if (instr->type != nir_instr_type_tex)
      return false;

   const nir_tex_instr *tex = nir_instr_as_tex(instr);

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txd:
   case nir_texop_txf:
   case nir_texop_txf_ms:
   case nir_texop_txf_ms_mcs_intel:
   case nir_texop_txl:
   case nir_texop_lod:
   case nir_texop_tg4:
   case nir_texop_texture_samples:
      return true;
   default:
      return false;
   }
}
/* Returns true if any instruction in the shader is a sampler-using
 * texture operation (see instr_uses_sampler).  The pass helper is used
 * purely as a query: the callback never modifies the shader, so all
 * metadata is preserved.
 */
static bool
brw_nir_uses_sampler(nir_shader *shader)
{
   const bool found =
      nir_shader_instructions_pass(shader, instr_uses_sampler,
                                   nir_metadata_all, NULL);
   return found;
}
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
struct brw_compile_cs_params *params)
@ -129,6 +161,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
.required_width = brw_required_dispatch_width(&nir->info),
};
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
std::unique_ptr<fs_visitor> v[3];
for (unsigned simd = 0; simd < 3; simd++) {

View file

@ -881,6 +881,9 @@ struct brw_cs_prog_data {
uint8_t generate_local_id;
enum intel_compute_walk_order walk_order;
/* True if shader has any sample operation */
bool uses_sampler;
struct {
struct brw_push_const_block cross_thread;
struct brw_push_const_block per_thread;

View file

@ -390,6 +390,17 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
struct GENX(COMPUTE_WALKER_BODY) body = {
.SIMDSize = dispatch_size,
/* HSD 14016252163: Use of Morton walk order (and batching using a batch
* size of 4) is expected to increase sampler cache hit rates by
* increasing sample address locality within a subslice.
*/
#if GFX_VER >= 30
.DispatchWalkOrder = prog_data->uses_sampler ?
MortonWalk :
LinearWalk,
.ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 :
TG_BATCH_1,
#endif
.MessageSIMD = dispatch_size,
.IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
.IndirectDataLength = comp_state->base.push_constants_state.alloc_size,