pan/va: Support relaxed waits on read-only render targets

On Valhall we can relax waits, which wait for both readers and writers, into resource_waits, which only wait for writers, allowing threads accessing read-only resources to execute concurrently. Let's use that on LD_TILE instructions so we can optimize the read-only case. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32540>
2026-05-05 16:08:04 +02:00 · 2025-03-07 15:39:02 +01:00 · 2025-03-07 15:39:02 +01:00 · 4f4ac56145
commit 4f4ac56145
parent fbb2805575
6 changed files with 58 additions and 2 deletions
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@ -1554,6 +1554,13 @@ load("sampler_lod_parameters_pan", [1], flags=[CAN_ELIMINATE, CAN_REORDER])
 # and is ignored otherwise
 load("converted_output_pan", [1, 1, 1], indices=[DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])

+# Like converted_output_pan, but for the case where the output is never written by the shader.
+# This is used to relax waits on tile-buffer accesses when the target is read-only.
+# src[] = { target, sample, conversion }
+# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0
+# and is ignored otherwise
+load("readonly_output_pan", [1, 1, 1], indices=[DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])
+
 # Load input attachment target
 # src[] = { input_attachment_index }
 # valid targets are:
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@ -1841,6 +1841,9 @@ bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
   if (is_zs)
      I->z_stencil = true;

+   if (instr->intrinsic == nir_intrinsic_load_readonly_output_pan)
+      I->wait_resource = true;
+
   bi_emit_cached_split(b, dest, size * nr);
 }

@ -2103,6 +2106,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
      break;

   case nir_intrinsic_load_converted_output_pan:
+   case nir_intrinsic_load_readonly_output_pan:
      bi_emit_ld_tile(b, instr);
      break;

--- a/src/panfrost/compiler/compiler.h
+++ b/src/panfrost/compiler/compiler.h
@ -402,6 +402,9 @@ typedef struct {
   /* Flow control associated with a Valhall instruction */
   uint8_t flow;

+   /* Valhall-only property to relax waits on read-only resources */
+   bool wait_resource;
+
   /* Slot associated with a message-passing instruction */
   uint8_t slot;

--- a/src/panfrost/compiler/valhall/ISA.xml
+++ b/src/panfrost/compiler/valhall/ISA.xml
@ -95,7 +95,7 @@
    <value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
    <value name="Perform branch reconverge">reconverge</value>
    <reserved/>
-    <reserved/>
+    <value name="Wait for resource read">wait_resource</value>
    <value name="Terminate discarded threads">discard</value>
    <reserved/>
    <value name="Return from shader">end</value>
--- a/src/panfrost/compiler/valhall/va_insert_flow.c
+++ b/src/panfrost/compiler/valhall/va_insert_flow.c
@ -21,6 +21,8 @@
 * SOFTWARE.
 */

+#include "panfrost/lib/pan_props.h"
+
 #include "bi_builder.h"
 #include "va_compiler.h"
 #include "valhall_enums.h"
@ -441,11 +443,21 @@ va_insert_flow_control_nops(bi_context *ctx)
          * instructions. Wait for slot 0 immediately after the ATEST.
          */
         case BI_OPCODE_BLEND:
-         case BI_OPCODE_LD_TILE:
         case BI_OPCODE_ST_TILE:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
            break;
+
+         case BI_OPCODE_LD_TILE: {
+            if (ctx->inputs->is_blend)
+               break;
+
+            assert(!I->wait_resource || pan_arch(ctx->inputs->gpu_id) >= 10);
+            bi_flow(ctx, bi_before_instr(I),
+                    I->wait_resource ? VA_FLOW_WAIT_RESOURCE : VA_FLOW_WAIT);
+            break;
+	 }
+
         case BI_OPCODE_ATEST:
            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
--- a/src/panfrost/compiler/valhall/va_merge_flow.c
+++ b/src/panfrost/compiler/valhall/va_merge_flow.c
@ -161,6 +161,35 @@ merge_waits(bi_block *block)
   }
 }

+static void
+merge_resource_waits(bi_block *block)
+{
+   /* Fold standalone NOPs carrying VA_FLOW_WAIT_RESOURCE into an earlier
+    * instruction of the same block: the earlier instruction takes over the
+    * wait_resource flow control and the NOP is removed.
+    *
+    * last_free is the most recent instruction with which we can merge, or
+    * NULL if none.
+    */
+   bi_instr *last_free = NULL;
+
+   bi_foreach_instr_in_block_safe(block, I) {
+      if (last_free != NULL && I->op == BI_OPCODE_NOP &&
+          I->flow == VA_FLOW_WAIT_RESOURCE) {
+
+         /* Merge resource_waits with compatible instructions. last_free is
+          * left untouched, so further wait_resource NOPs that follow are
+          * folded into the same instruction.
+          */
+         last_free->flow = VA_FLOW_WAIT_RESOURCE;
+         bi_remove_instruction(I);
+         continue;
+      }
+
+      /* Don't move waits past async instructions, since they might be what
+       * we're waiting for. If we wanted to optimize this case, we could check
+       * the signaled slots.
+       */
+      if (bi_get_opcode_props(I)->message)
+         last_free = NULL;
+
+      /* We can only merge with instructions that carry no flow control yet
+       * (VA_FLOW_NONE). This check runs after the reset above, so an async
+       * instruction without flow control becomes the new merge candidate
+       * itself.
+       */
+      if (I->flow == VA_FLOW_NONE)
+         last_free = I;
+   }
+}
+
 static bool
 bi_is_first_instr(bi_block *block, bi_instr *I)
 {
@ -221,6 +250,7 @@ va_merge_flow(bi_context *ctx)
         continue;

      merge_end_reconverge(block);
+      merge_resource_waits(block);
      merge_waits(block);

      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)