intel: Use Morton compute walk order

According to HSD 14016252163, if a compute shader uses the sample
operation, using Morton walk order and setting the thread group batch
size to 4 is expected to increase sampler cache hit rates by increasing
sample address locality within a subslice.

Rework:
 * Caio: "||" => "&&" for type checking in instr_uses_sampler()
 * Jordan: Use nir's foreach macros rather than
   nir_shader_lower_instructions()

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32430>
This commit is contained in:
Sagar Ghuge 2023-08-09 17:15:11 -07:00
parent 4bd958243d
commit d3f9139e49
6 changed files with 60 additions and 0 deletions

View file

@ -379,6 +379,7 @@ struct iris_cs_data {
enum intel_compute_walk_order walk_order;
bool uses_barrier;
bool uses_sampler;
bool first_param_is_builtin_subgroup_id;
};

View file

@ -150,6 +150,7 @@ iris_apply_brw_cs_prog_data(struct iris_compiled_shader *shader,
iris->generate_local_id = brw->generate_local_id;
iris->walk_order = brw->walk_order;
iris->uses_barrier = brw->uses_barrier;
iris->uses_sampler = brw->uses_sampler;
iris->prog_mask = brw->prog_mask;
iris->first_param_is_builtin_subgroup_id =

View file

@ -8941,6 +8941,16 @@ static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
body.ExecutionMask = dispatch.right_mask;
body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
body.InterfaceDescriptor = idd;
/* HSD 14016252163: Use of Morton walk order (and batching using a batch
* size of 4) is expected to increase sampler cache hit rates by
* increasing sample address locality within a subslice.
*/
#if GFX_VER >= 30
body.DispatchWalkOrder =
cs_data->uses_sampler ? MortonWalk : LinearWalk;
body.ThreadGroupBatchSize =
cs_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1;
#endif
struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {

View file

@ -98,6 +98,38 @@ run_cs(fs_visitor &s, bool allow_spilling)
return !s.failed;
}
/* nir_shader_instructions_pass callback: report whether a single
 * instruction is a texture operation that goes through the sampler.
 * Returns false for every non-texture instruction.
 */
static bool
instr_uses_sampler(nir_builder *b, nir_instr *instr, void *cb_data)
{
   /* Only texture instructions can involve the sampler. */
   if (instr->type != nir_instr_type_tex)
      return false;

   const nir_tex_instr *tex = nir_instr_as_tex(instr);

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txd:
   case nir_texop_txf:
   case nir_texop_txf_ms:
   case nir_texop_txf_ms_mcs_intel:
   case nir_texop_txl:
   case nir_texop_lod:
   case nir_texop_tg4:
   case nir_texop_texture_samples:
      return true;
   default:
      return false;
   }
}
/* Returns true if any instruction in the shader is a sampler-using
 * texture operation (see instr_uses_sampler).  The pass helper is used
 * purely as a query: the callback never modifies the shader, so all
 * metadata is preserved.
 */
static bool
brw_nir_uses_sampler(nir_shader *shader)
{
   const bool found =
      nir_shader_instructions_pass(shader, instr_uses_sampler,
                                   nir_metadata_all, NULL);
   return found;
}
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
struct brw_compile_cs_params *params)
@ -129,6 +161,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
.required_width = brw_required_dispatch_width(&nir->info),
};
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
std::unique_ptr<fs_visitor> v[3];
for (unsigned simd = 0; simd < 3; simd++) {

View file

@ -881,6 +881,9 @@ struct brw_cs_prog_data {
uint8_t generate_local_id;
enum intel_compute_walk_order walk_order;
/* True if shader has any sample operation */
bool uses_sampler;
struct {
struct brw_push_const_block cross_thread;
struct brw_push_const_block per_thread;

View file

@ -390,6 +390,17 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
struct GENX(COMPUTE_WALKER_BODY) body = {
.SIMDSize = dispatch_size,
/* HSD 14016252163: Use of Morton walk order (and batching using a batch
* size of 4) is expected to increase sampler cache hit rates by
* increasing sample address locality within a subslice.
*/
#if GFX_VER >= 30
.DispatchWalkOrder = prog_data->uses_sampler ?
MortonWalk :
LinearWalk,
.ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 :
TG_BATCH_1,
#endif
.MessageSIMD = dispatch_size,
.IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
.IndirectDataLength = comp_state->base.push_constants_state.alloc_size,