jay: rewrite demote/terminate/helper/halt handling

* implement terminate
* fix HALT brokenness on all shader stages (we need a real end block)
* optimize demote codegen a ton
* optimize gl_HelperInvocation/gl_SampleMask
* optimize "all lanes demoted" via HALT.any
* optimize scheduling of stores/atomics/demotes in FS
* optimize some texturing with helper invocations

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42097>
2026-06-21 01:38:23 +02:00 · 2026-06-11 12:47:46 -04:00 · 2026-06-11 12:47:46 -04:00 · 9cc686ac72
commit 9cc686ac72
parent 52d4d47edc
16 changed files with 405 additions and 178 deletions
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -1039,7 +1039,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_dpas_intel:
   case nir_intrinsic_convert_cmat_intel:
   case nir_intrinsic_load_coverage_mask_intel:
-   case nir_intrinsic_load_dispatch_mask_intel:
   case nir_intrinsic_isberd_nv:
   case nir_intrinsic_isbewr_nv:
   case nir_intrinsic_vild_nv:
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@ -2686,9 +2686,6 @@ system_value("simd_width_intel", 1)
 # IndirectDataStartAddress
 system_value("indirect_address_intel", 1)

-# The dispatch mask as provided in the FS payload.
-system_value("dispatch_mask_intel", 1)
-
 # The raw coverage mask as provided in the FS payload.
 # The semantics of it depend on the HW state.
 system_value("coverage_mask_intel", 1)
@ -2704,8 +2701,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
          indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER])

 # Write a render target
-# src[] = { color, dual_color, src0_alpha, omask, depth, stencil, predicate }
-intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32, 1])
+# src[] = { color, dual_color, src0_alpha, omask, depth, stencil }
+intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32])

 # Shuffle with an offset in bytes instead of a lane index.
 # src[] = { payload, lane offset in bytes }
--- a/src/intel/compiler/jay/jay_assign_flags.c
+++ b/src/intel/compiler/jay/jay_assign_flags.c
@ -72,10 +72,15 @@ assign_flag(struct flag_ra *ra,
   jay_def tmp = jay_alloc_def(ra->b, file, 1);

   unsigned num_flags = jay_num_regs(ra->b->shader, FLAG);
+   if (ra->b->shader->helpers_tracked) {
+      /* Helper tracking uses the last flag by definition */
+      num_flags--;
+   }
+
   tmp.reg = tie ? tie->reg : ballot ? 0 : ((ra->roundrobin++) % num_flags);

   /* Uniform access (via a UFLAG or an inverse-ballot) would clobber the zero
-    * for a ballot. We could refine this further but this should be ok for now.
+    * for a ballot. TODO: This needs to be reworked to get the flag back.
    */
   if (!ballot &&
       tmp.reg == 0 &&
@ -84,6 +89,8 @@ assign_flag(struct flag_ra *ra,
      assert(!tie);
      tmp.reg = 1;
      ra->roundrobin++;
+
+      assert(num_flags >= 2); /* XXX: Not always true, FIXME */
   }

   if (jay_index(canonical) < ra->nr_vars) {
@ -193,6 +200,16 @@ assign_block(struct flag_ra *ra)
         I->type = JAY_TYPE_U32;
         I->dst = canonicalize_flag(I->dst);
         continue;
+      } else if (I->op == JAY_OPCODE_SEND &&
+                 jay_send_skip_helpers(I) &&
+                 jay_is_no_mask(I)) {
+
+         /* jay_lower_helpers will clobber flag 0 to handle this case, see the
+          * logic there. Evict whatever was there.
+          */
+         ra->flag_to_global[0] = 0;
+         assert(!I->predication);
+         continue;
      } else if (I->type == JAY_TYPE_U1) {
         /* Boolean logic turns into bitwise logic on the canonical form */
         if (!jay_is_null(I->dst)) {
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@ -8,12 +8,14 @@
 #include "compiler/brw/brw_eu_defines.h"
 #include "compiler/brw/brw_nir.h"
 #include "compiler/brw/brw_sampler.h"
+#include "compiler/gen/gen_enums.h"
 #include "compiler/intel_nir.h"
 #include "compiler/intel_shader_enums.h"
 #include "compiler/list.h"
 #include "intel/dev/intel_debug.h"
 #include "mda/debug_archiver.h"
 #include "util/bitscan.h"
+#include "util/bitset.h"
 #include "util/lut.h"
 #include "util/macros.h"
 #include "util/u_math.h"
@ -81,10 +83,7 @@ struct nir_to_jay_state {
   const struct intel_device_info *devinfo;

   jay_builder bld;
-
-   jay_block *current_block;
-   jay_block *after_block;
-   jay_block *break_block;
+   jay_block *current_block, *after_block, *break_block, *exit_block;

   unsigned indent;
   bool needs_final_halt;
@ -832,19 +831,6 @@ scalars_equal(nir_scalar a, nir_scalar b)
           nir_scalar_as_uint(a) == nir_scalar_as_uint(b));
 }

-static void
-jay_emit_halt_target(struct nir_to_jay_state *nj)
-{
-   /* This final halt will re-enable the channels which got masked off by first
-    * HALT.
-    */
-   if (nj->needs_final_halt) {
-      /* This avoids re-emitting the halt after EOT send */
-      nj->needs_final_halt = false;
-      jay_HALT_TARGET(&nj->bld);
-   }
-}
-
 static void
 jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
 {
@ -860,8 +846,6 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
   const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0);
   const bool last = !nir_instr_next(&intr->instr);

-   jay_emit_halt_target(nj);
-
   /* The hardware freaks out if we give it an omask without multisampling. */
   if (!b->shader->prog_data->fs.uses_omask) {
      omask = jay_null();
@ -941,15 +925,10 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
         srcs[len++] = jay_extract(packed, i);
   }

-   jay_inst *send =
-      jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
-               .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
-               .type = JAY_TYPE_U32, .eot = last, .split = split);
-
-   /* Handle the disable predicate. It is logically inverted. */
-   if (!nir_src_is_zero(intr->src[6])) {
-      jay_add_predicate(b, send, jay_negate(nj_src(intr->src[6])));
-   }
+   jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
+            .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
+            .type = JAY_TYPE_U32, .eot = last, .split = split,
+            .skip_helpers = true);
 }

 static enum lsc_data_size
@ -1572,19 +1551,6 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
      jay_MOV(b, dst, fs->coverage_mask);
      break;

-   case nir_intrinsic_load_dispatch_mask_intel: {
-      jay_def mask = jay_extract(nj->payload.u0, 15);
-
-      if (nj->s->dispatch_width == 32) {
-         /* TODO: Optimize */
-         jay_def hi = jay_extract(nj->payload.u1, 15);
-         mask = jay_BFI2_u32(b, 0xffff0000, hi, mask);
-      }
-
-      jay_MOV(b, dst, mask);
-      break;
-   }
-
   case nir_intrinsic_load_subgroup_invocation: {
      jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
      jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
@ -1600,8 +1566,16 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
   }

   case nir_intrinsic_demote:
+      jay_DEMOTE_u32(b, jay_null(), jay_null());
+      break;
   case nir_intrinsic_demote_if:
-      /* TODO: Already lowered, but need to implement for performance. */
+      jay_DEMOTE(b, JAY_TYPE_U1, nj_src(intr->src[0]), 0)->conditional_mod =
+         GEN_CONDITION_NE;
+      break;
+
+   case nir_intrinsic_load_helper_invocation:
+   case nir_intrinsic_is_helper_invocation:
+      jay_HELPER_SEL(b, dst, 1, 0);
      break;

   case nir_intrinsic_ddx:
@ -2455,7 +2429,8 @@ jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
            .ex_desc = desc_ex_src, .header = header, .srcs = payload,
            .nr_srcs = n_sources, .type = JAY_TYPE_U32,
            .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform,
-            .bindless = surface_bindless, .pure = true);
+            .bindless = surface_bindless, .pure = true,
+            .skip_helpers = tex->skip_helpers);

   /* If we sampled into a temporary, copy out to the final */
   if (residency) {
@ -2484,7 +2459,8 @@ jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr)
      break;
   case nir_jump_halt:
      nj->needs_final_halt = true;
-      jay_HALT(&nj->bld);
+      jay_block_add_successor(nj->current_block, nj->exit_block, GPR);
+      jay_HALT(&nj->bld, false);
      break;
   case nir_jump_return:
      /* Should be lowered */
@ -2754,8 +2730,16 @@ static void
 jay_emit_eot(struct nir_to_jay_state *nj)
 {
   jay_builder *b = &nj->bld;
+   b->cursor = jay_after_block(nj->exit_block);

-   jay_emit_halt_target(nj);
+   /* Jump target for HALT */
+   if (nj->needs_final_halt) {
+      if (nj->s->stage == MESA_SHADER_FRAGMENT) {
+         assert(nj->s->helpers_tracked);
+      } else {
+         jay_HALT_TARGET(&nj->bld);
+      }
+   }

   if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
      jay_def u0 = nj->payload.u0;
@ -2773,12 +2757,18 @@ jay_emit_eot(struct nir_to_jay_state *nj)
               .uniform = true);
   } else if (nj->nir->info.stage == MESA_SHADER_VERTEX ||
              nj->nir->info.stage == MESA_SHADER_TESS_EVAL) {
-      jay_block *block = jay_last_block(nj->f);
+      jay_block *block = jay_last_source_block(nj->f);
      jay_inst *I = jay_last_inst(block);

+      assert(!nj->needs_final_halt && "halt not supported with URB");
+
      /* TODO: What if this isn't the case? Do we need a no-op store...? */
      assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == GEN_SFID_URB);
+
+      /* Pluck out the final SEND and put it in the exit block */
      jay_set_send_eot(I, true);
+      jay_remove_instruction(I);
+      jay_builder_insert(b, I);
   }
 }

@ -3012,6 +3002,14 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
      }
   }

+   /* INIT_HELPERS reads UGPRs but has no SSA write. Therefore to minimize
+    * pressure, we want to hoist it as much as possible.
+    */
+   if (nj->s->helpers_tracked) {
+      jay_INIT_HELPERS(&nj->bld, jay_extract(nj->payload.u0, 15),
+                       payload_u1(nj, 15, 1));
+   }
+
   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
      if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) {
         *(split_gprs[i].def) =
@ -3178,7 +3176,11 @@ jay_from_nir_function(const struct intel_device_info *devinfo,
      jay_setup_payload(&nj);
   }

+   nj.exit_block = jay_create_block(&nj);
   jay_emit_cf_list(&nj, &impl->body);
+   jay_block_add_successor(nj.current_block, nj.exit_block, GPR);
+
+   list_addtail(&nj.exit_block->link, &f->blocks);
   jay_emit_eot(&nj);
   jay_remove_unreachable_blocks(f);
 }
@ -3216,8 +3218,9 @@ jay_compile(const struct intel_device_info *devinfo,
      INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) &&
      !(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL));

+   bool track_helpers = false;
   unsigned simd_width =
-      jay_process_nir(devinfo, nir, prog_data, key, archiver);
+      jay_process_nir(devinfo, nir, prog_data, key, archiver, &track_helpers);

   if (debug) {
      /* We can't use nir_print_shader since it reindexes SSA defs. */
@ -3232,6 +3235,7 @@ jay_compile(const struct intel_device_info *devinfo,
   s->devinfo = devinfo;
   s->prog_data = prog_data;
   s->archiver = archiver;
+   s->helpers_tracked = track_helpers;

   nir_foreach_function_impl(impl, nir) {
      jay_from_nir_function(devinfo, nir, s, impl);
@ -3288,6 +3292,10 @@ jay_compile(const struct intel_device_info *devinfo,
      JAY_PASS(s, jay_insert_payload_swizzle);
   }

+   if (s->stage == MESA_SHADER_FRAGMENT && s->helpers_tracked) {
+      JAY_PASS(s, jay_lower_helpers);
+   }
+
   if (!(jay_debug & JAY_DBG_NOOPT)) {
      /* jay_assign_accumulators uses a conservative liveness analysis for
       * predication, so assign accumulators before predicating for better
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@ -787,6 +787,12 @@ typedef struct jay_shader {
   unsigned scratch_size;
   unsigned payload_gprs, payload_ugprs, push_grfs;

+   /**
+    * In a fragment shader, whether a helper invocation flag is tracked. Flag RA
+    * must reserve the relevant flag.
+    */
+   bool helpers_tracked;
+
   /**
    * Ralloc linear context. Since we don't typically free as we go,
    * most allocations should go through this context for efficiency.
@ -1126,7 +1132,7 @@ jay_new_block(jay_function *f)
 static inline bool
 jay_op_is_control_flow(enum jay_opcode op)
 {
-   return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE;
+   return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_HALT;
 }

 /**
@ -1201,6 +1207,9 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
 #define jay_foreach_block_rev(f, v)                                            \
   list_for_each_entry_rev(jay_block, v, &f->blocks, link)

+#define jay_foreach_block_safe_rev(f, v)                                       \
+   list_for_each_entry_safe_rev(jay_block, v, &f->blocks, link)
+
 #define jay_foreach_block_from(f, from, v)                                     \
   list_for_each_entry_from(jay_block, v, from, &f->blocks, link)

@ -1238,7 +1247,7 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
      jay_foreach_inst_in_block_safe(block, v)

 #define jay_foreach_inst_in_func_safe_rev(func, block, v)                      \
-   jay_foreach_block_rev(func, block)                                          \
+   jay_foreach_block_safe_rev(func, block)                                     \
      jay_foreach_inst_in_block_safe_rev(block, v)

 #define jay_foreach_inst_in_shader(s, func, inst)                              \
@ -1355,6 +1364,15 @@ jay_last_block(jay_function *f)
      return list_last_entry(&f->blocks, jay_block, link);
 }

+/**
+ * Returns the block immediately preceding the final block in the function's
+ * block list, or NULL when the function has fewer than two blocks.
+ */
+static inline jay_block *
+jay_last_source_block(jay_function *f)
+{
+   const bool has_two_blocks =
+      !list_is_empty(&f->blocks) && !list_is_singular(&f->blocks);
+
+   if (!has_two_blocks) {
+      return NULL;
+   }
+
+   /* Taking the "last entry" relative to the final block's own link yields the
+    * entry just before it in the list.
+    */
+   jay_block *final_block = jay_last_block(f);
+   return list_last_entry(&final_block->link, jay_block, link);
+}
+
 static inline jay_inst *
 jay_last_inst(jay_block *block)
 {
@ -1373,11 +1391,14 @@ jay_next_block(jay_block *block)
 static inline void
 jay_block_add_successor(jay_block *block, jay_block *succ, enum jay_file file)
 {
+   /* Prune duplicate successors so the caller doesn't need to worry */
   jay_block **succs = jay_successors(block, file);
-   unsigned i = succs[0] ? 1 : 0;
+   if (succs[0] == succ || succs[1] == succ) {
+      return;
+   }

-   assert(succ && succs[0] != succ && succs[1] != succ);
-   assert(succs[i] == NULL && "at most 2 successors");
+   unsigned i = succs[0] ? 1 : 0;
+   assert(succ && succs[i] == NULL && "at most 2 successors");

   succs[i] = succ;
   util_dynarray_append(jay_predecessors(succ, file), block);
--- a/src/intel/compiler/jay/jay_lower_helpers.c
+++ b/src/intel/compiler/jay/jay_lower_helpers.c
@ -0,0 +1,191 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "compiler/gen/gen_enums.h"
+#include "util/list.h"
+#include "util/u_dynarray.h"
+#include "jay_builder.h"
+#include "jay_builder_opcodes.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/* State carried across blocks while lowering helper-invocation tracking. */
+struct ctx {
+   /* Flag register tracking helpers. The lowering predicates skip_helpers
+    * SENDs on its negation and selects source 0 of HELPER_SEL when it is set.
+    */
+   jay_def helper_flag;
+
+   /* halted: a HALT was visited or inserted, so the exit block needs a
+    * HALT_TARGET. uses_terminate: a HALT instruction has been visited by the
+    * bottom-up walk, which stops demotes from inserting their own HALT.
+    */
+   bool halted, uses_terminate;
+
+   /* Number of instructions visited so far by the bottom-up walk. */
+   unsigned instr_left;
+};
+
+/*
+ * Takes src, a linked list containing the element pivot, and dst, an empty
+ * list. Moves all elements up to and including pivot from src to dst, leaving
+ * the rest in src. Semantically equivalent to a loop of list_move but O(1)
+ * time regardless of the position of pivot in the list.
+ */
+static void
+list_partition(struct list_head *src,
+               struct list_head *dst,
+               struct list_head *pivot)
+{
+   /* dst runs from src[0] to pivot */
+   dst->next = src->next;
+   dst->prev = pivot;
+   dst->next->prev = dst;
+
+   /* src runs from pivot[1:] to the end of src. Its tail (src->prev) is
+    * unchanged unless pivot was the tail; in that case src->next is src itself
+    * and the back-link fixup below resets src->prev to src, leaving src empty.
+    */
+   src->next = pivot->next;
+   src->next->prev = src;
+   pivot->next = dst;
+
+   list_validate(dst);
+   list_validate(src);
+}
+
+static void
+process_block(struct ctx *ctx, jay_builder *b, jay_block *block)
+{ /*
+   * Lowers the helper-tracking instructions of one block in terms of
+   * ctx->helper_flag:
+   *  - INIT_HELPERS becomes NOT(s) of its sources written to the flag.
+   *  - DEMOTE becomes a CMP writing the flag, predicated on the negated flag.
+   *  - HELPER_SEL becomes a SEL on the flag.
+   *  - SENDs marked skip_helpers get predicated so flagged lanes skip them.
+   *
+   * Instructions are visited bottom-up and ctx persists across calls, so
+   * ctx->instr_left counts the instructions visited so far and
+   * ctx->uses_terminate records whether a HALT has already been visited.
+   */
+   jay_foreach_inst_in_block_safe_rev(block, I) {
+      b->cursor = jay_before_inst(I);
+
+      if (I->op == JAY_OPCODE_INIT_HELPERS) {
+         jay_NOT(b, ctx->helper_flag, I->src[0])->type = JAY_TYPE_U16;
+
+         if (!jay_is_null(I->src[1])) { /* second half is optional */
+            jay_def hi = ctx->helper_flag;
+            hi.hi = true;
+            jay_NOT(b, hi, I->src[1])->type = JAY_TYPE_U16;
+         }
+
+         jay_remove_instruction(I);
+      } else if (I->op == JAY_OPCODE_HALT) {
+         ctx->halted = ctx->uses_terminate = true;
+      } else if (I->op == JAY_OPCODE_DEMOTE) {
+         enum gen_condition cond = I->conditional_mod;
+         jay_def x = I->src[0], y = I->src[1];
+
+         /* Unconditional discard: no condition was set, so compare UGPR 0
+          * against itself for equality.
+          */
+         if (!cond) {
+            cond = GEN_CONDITION_EQ;
+            I->type = JAY_TYPE_U32;
+            x = y = jay_bare_reg(UGPR, 0);
+         }
+
+         jay_inst *cmp = jay_CMP(b, I->type, cond, ctx->helper_flag, x, y);
+         jay_add_predicate(b, cmp, jay_negate(ctx->helper_flag));
+         jay_remove_instruction(I);
+
+         /* We are allowed to halt after a demote if all lanes are inactive
+          * for performance, but it's not required for correctness. Only do
+          * it if it's likely profitable.
+          *
+          * We assume a shader either uses SPIR-V demote or terminate, but
+          * not both. If the shader uses terminate, there will be an actual
+          * HALT instruction after us so we don't bother with a second HALT
+          * here. Strictly there's a corner case here if all non-helpers are
+          * terminated but lanes spawned as helpers are not terminated, but
+          * this is probably reasonable as a tradeoff.
+          *
+          * NOTE(review): the HALT inserted here ends the split block, which
+          * the caller's reverse block walk appears to visit next; the HALT
+          * branch above would then set uses_terminate, so demotes earlier in
+          * the program would not get their own HALT. Confirm this is intended.
+          */
+         if (ctx->instr_left > 6 && !ctx->uses_terminate) {
+            jay_inst *halt = jay_HALT(b, true);
+            halt = jay_add_predicate(b, halt, ctx->helper_flag);
+            ctx->halted = true;
+
+            jay_block *split = jay_new_block(b->func);
+            split->indent = block->indent;
+
+            list_partition(&block->instructions, &split->instructions,
+                           &halt->link);
+            list_addtail(&split->link, &block->link);
+
+            /* The split block either falls through or jumps to the exit.
+             * Every predecessor of block is redirected to split, split takes
+             * over block's predecessor arrays, and block's arrays are reset
+             * before the new edges are added.
+             */
+            for (unsigned file = GPR; file <= UGPR; ++file) {
+               jay_foreach_predecessor(block, pred, file) {
+                  jay_block **succs = jay_successors(*pred, file);
+                  unsigned idx = succs[0] == block ? 0 : 1;
+                  succs[idx] = split;
+               }
+            }
+            typed_memcpy(&split->physical_preds, &block->physical_preds, 1);
+            typed_memcpy(&split->logical_preds, &block->logical_preds, 1);
+            util_dynarray_init(&block->physical_preds, block);
+            util_dynarray_init(&block->logical_preds, block);
+
+            jay_block_add_successor(split, block, GPR);
+            jay_block_add_successor(split, jay_last_block(b->func), GPR);
+            return; /* the remaining instructions now live in split */
+         }
+      } else if (I->op == JAY_OPCODE_HELPER_SEL) {
+         jay_SEL(b, JAY_TYPE_U32, I->dst, I->src[0], I->src[1],
+                 ctx->helper_flag);
+         jay_remove_instruction(I);
+      } else if (I->op == JAY_OPCODE_SEND && jay_send_skip_helpers(I)) {
+         if (jay_is_no_mask(I)) {
+            /* jay_assign_flags ensured this is free for us, see logic there.
+             * The NOT of the helper flag is evaluated with an NE conditional
+             * modifier into uniform flag 0, which then predicates the SEND.
+             */
+            jay_def t = jay_bare_reg(UFLAG, 0);
+            jay_inst *not = jay_NOT(b, jay_null(), ctx->helper_flag);
+            not->type = JAY_TYPE_U | b->shader->dispatch_width;
+            jay_set_conditional_mod(b, not, t, GEN_CONDITION_NE);
+            jay_add_predicate(b, I, t);
+         } else {
+            jay_add_predicate(b, I, jay_negate(ctx->helper_flag));
+         }
+      }
+
+      ++ctx->instr_left;
+   }
+}
+
+void
+jay_lower_helpers(jay_shader *shader)
+{ /*
+   * Lowers helper-invocation tracking in the shader's entrypoint. Runs
+   * process_block over every block in reverse order, then completes the exit
+   * (last) block: a HALT_TARGET if any HALT was visited or inserted, followed
+   * by the SEND that ends the thread.
+   */
+   jay_function *entrypoint = jay_shader_get_entrypoint(shader);
+   jay_block *exit_block = jay_last_block(entrypoint);
+   jay_block *last_source_block = jay_last_source_block(entrypoint);
+
+   /* By ABI with jay_assign_flags, the last flag is used to track helpers */
+   assert(shader->helpers_tracked);
+   unsigned helper_flag_no = jay_num_regs(shader, FLAG) - 1;
+   struct ctx ctx = { .helper_flag = jay_bare_reg(FLAG, helper_flag_no) };
+   jay_builder b = jay_init_builder(entrypoint, jay_after_block(exit_block));
+
+   jay_foreach_block_rev(entrypoint, block) {
+      process_block(&ctx, &b, block);
+   }
+
+   /* Fill out the exit block. process_block moves the cursor around, so it is
+    * reset to the end of the exit block first.
+    */
+   b.cursor = jay_after_block(exit_block);
+   if (ctx.halted) {
+      jay_HALT_TARGET(&b);
+   }
+
+   /* Try to pluck out the last instruction and use it for EOT. This breaks SSA
+    * dominance invariants but that's why this is a post-RA, post-sched pass.
+    * Only SWSB has to deal with the resulting mess.
+    *
+    * There may be no such send (in case of an unconditional terminate). In that
+    * case, insert a predicated-out null RT write to use for EOT.
+    *
+    * NOTE(review): last_source_block is not NULL-checked here, although
+    * jay_last_source_block returns NULL for fewer than two blocks; this
+    * presumably relies on the entrypoint always having a block before the exit
+    * block. TODO confirm.
+    */
+   jay_inst *send = jay_last_inst(last_source_block);
+   if (send && send->op == JAY_OPCODE_SEND && jay_send_eot(send)) {
+      jay_remove_instruction(send);
+      jay_builder_insert(&b, send);
+   } else {
+      jay_def dummy = jay_bare_reg(GPR, 0);
+      dummy.num_values_m1 = 4 - 1;
+
+      unsigned op = shader->dispatch_width == 32 ?
+                       XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
+                       BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+      uint64_t desc = brw_fb_write_desc(shader->devinfo, 0, op, true, false);
+      uint64_t ex_desc = (1 << 20) /* null rt */;
+
+      send = jay_SEND(&b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
+                      .msg_desc = desc | (ex_desc << 32), .nr_srcs = 1,
+                      .srcs = &dummy, .type = JAY_TYPE_U32, .eot = true);
+      send = jay_add_predicate(&b, send, jay_negate(ctx.helper_flag));
+   }
+}
--- a/src/intel/compiler/jay/jay_lower_pre_ra.c
+++ b/src/intel/compiler/jay/jay_lower_pre_ra.c
@ -96,7 +96,7 @@ try_swap_src01(jay_inst *I)
   if (I->op == JAY_OPCODE_SEL) {
      /* sel(a, b, p) = sel(b, a, !p) */
      I->src[2].negate ^= true;
-   } else if (I->op == JAY_OPCODE_CMP) {
+   } else if (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_DEMOTE) {
      I->conditional_mod = gen_condition_swap_sources(I->conditional_mod);
   } else if (I->op == JAY_OPCODE_BFN) {
      jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1));
--- a/src/intel/compiler/jay/jay_nir.c
+++ b/src/intel/compiler/jay/jay_nir.c
@ -46,21 +46,6 @@ nj_index_ssa_defs(nir_shader *nir)
   }
 }

-static bool
-lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
-{
-   if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
-      return false;
-
-   /* TODO: Is this right for multisampling? */
-   b->cursor = nir_before_instr(&intr->instr);
-   nir_def *active =
-      nir_inot(b, nir_inverse_ballot(b, nir_load_dispatch_mask_intel(b)));
-
-   nir_def_replace(&intr->def, active);
-   return true;
-}
-
 static bool
 lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
 {
@ -178,8 +163,7 @@ insert_rt_store(nir_builder *b,
                nir_def *src0_colour,
                nir_def *depth,
                nir_def *stencil,
-                nir_def *sample_mask,
-                nir_def *disable)
+                nir_def *sample_mask)
 {
   bool null_rt = target < 0;

@ -197,8 +181,7 @@ insert_rt_store(nir_builder *b,
   nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3);

   nir_store_render_target_intel(b, colour, dual_colour, src0_alpha,
-                                 sample_mask, depth, stencil, disable,
-                                 .target = target);
+                                 sample_mask, depth, stencil, .target = target);
 }

 static void
@ -216,14 +199,10 @@ lower_fragment_outputs(nir_function_impl *impl,

   nir_def *undef = nir_undef(b, 1, 32);

-   nir_def *disable = b->shader->info.fs.uses_discard ?
-                         nir_is_helper_invocation(b, 1) :
-                         nir_imm_false(b);
-
   if (ctx.dual_blend) {
      insert_rt_store(b, 0, ctx.colour[0], ctx.colour[1], NULL,
                      ctx.depth ?: undef, ctx.stencil ?: undef,
-                      ctx.sample_mask ?: undef, disable);
+                      ctx.sample_mask ?: undef);
      return;
   }

@ -239,83 +218,13 @@ lower_fragment_outputs(nir_function_impl *impl,
      if (ctx.colour[i]) {
         insert_rt_store(b, i, ctx.colour[i], NULL,
                         i > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
-                         ctx.stencil ?: undef, ctx.sample_mask ?: undef,
-                         disable);
+                         ctx.stencil ?: undef, ctx.sample_mask ?: undef);
      }
   }

   insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL, NULL,
                   last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
-                   ctx.stencil ?: undef, ctx.sample_mask ?: undef, disable);
-}
-
-/**
- * Drop render target stores with unconditional discards.
- */
-static bool
-opt_unconditional_discards(nir_shader *nir)
-{
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   nir_block *block = nir_impl_last_block(impl);
-
-   bool progress = false;
-   bool any_remaining_rt_writes = false;
-
-   nir_foreach_instr_reverse_safe(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-      if (intr->intrinsic == nir_intrinsic_store_render_target_intel) {
-         nir_scalar discard = nir_scalar_resolved(intr->src[6].ssa, 0);
-         if (nir_scalar_is_const(discard) && nir_scalar_as_bool(discard)) {
-            /* Drop store with unconditional discard */
-            nir_instr_remove(instr);
-            progress = true;
-         } else {
-            /* This RT store might actually happen */
-            any_remaining_rt_writes = true;
-         }
-      } else if ((intr->intrinsic == nir_intrinsic_demote ||
-                  intr->intrinsic == nir_intrinsic_terminate) &&
-                 !any_remaining_rt_writes) {
-         /* Delete unconditional demotes/terminates in the end block... */
-         nir_instr_remove(instr);
-         progress = true;
-      } else {
-         /* ...but stop if we find an intrinsic that has a side-effect */
-         const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-         if (!(info->flags & NIR_INTRINSIC_CAN_ELIMINATE))
-            break;
-      }
-   }
-
-   /* See if discards still exist in the program and flag accordingly */
-   nir->info.fs.uses_discard = false;
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type == nir_instr_type_intrinsic) {
-            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-            if (intr->intrinsic == nir_intrinsic_demote ||
-                intr->intrinsic == nir_intrinsic_demote_if ||
-                intr->intrinsic == nir_intrinsic_terminate ||
-                intr->intrinsic == nir_intrinsic_terminate_if)
-               nir->info.fs.uses_discard = true;
-         }
-      }
-   }
-
-   /* If we eliminated all RT stores, add a Null RT store to end the thread. */
-   if (!any_remaining_rt_writes) {
-      nir_builder b = nir_builder_at(nir_after_impl(impl));
-      nir_def *undef = nir_undef(&b, 1, 32);
-      insert_rt_store(&b, -1, NULL, NULL, NULL, undef, undef, undef,
-                      nir_imm_true(&b));
-   }
-
-   return nir_progress(progress, impl, nir_metadata_control_flow);
+                   ctx.stencil ?: undef, ctx.sample_mask ?: undef);
 }

 unsigned
@ -323,7 +232,8 @@ jay_process_nir(const struct intel_device_info *devinfo,
                nir_shader *nir,
                union brw_any_prog_data *prog_data,
                union brw_any_prog_key *key,
-                debug_archiver *archiver)
+                debug_archiver *archiver,
+                bool *track_helpers)
 {
   enum mesa_shader_stage stage = nir->info.stage;
   struct brw_compiler compiler = { .devinfo = devinfo };
@ -475,10 +385,14 @@ jay_process_nir(const struct intel_device_info *devinfo,

      lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
                             key->fs.nr_color_regions, simd_width);
-      JAY_NIR_PASS(nir_lower_helper_writes, true);
-      JAY_NIR_PASS(nir_lower_is_helper_invocation);
-      JAY_NIR_PASS(nir_shader_intrinsics_pass, lower_helper_invocation,
-                   nir_metadata_control_flow, NULL);
+
+      /* nir_lower_terminate_to_demote will hamper our ability to schedule
+       * terminates (since it turns them into real control flow), so run
+       * nir_opt_move_discards_to_top first as a prepass. That should help
+       * scheduling demotes too (which is more important).
+       */
+      JAY_NIR_PASS(nir_opt_move_discards_to_top);
+      JAY_NIR_PASS(nir_lower_terminate_to_demote);

      if (key->fs.alpha_to_coverage != INTEL_NEVER) {
         /* Run constant fold optimization in order to get the correct source
@ -495,8 +409,6 @@ jay_process_nir(const struct intel_device_info *devinfo,
       */
      brw_nir_optimize(pt);

-      NIR_PASS(_, nir, opt_unconditional_discards);
-
      // TODO
      // JAY_NIR_PASS(brw_nir_move_interpolation_to_top);

@ -556,10 +468,31 @@ jay_process_nir(const struct intel_device_info *devinfo,

   /* Run divergence analysis at the end */
   nir_sweep(nir);
-   nj_index_ssa_defs(nir);
   nir_divergence_analysis(nir);

-   if (stage != MESA_SHADER_FRAGMENT)
+   if (stage == MESA_SHADER_FRAGMENT) {
+      /* Certain features require tracking helpers for correctness */
+      nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+      *track_helpers |= nir->info.fs.uses_discard || nir->info.writes_memory;
+      *track_helpers |= BITSET_TEST(nir->info.system_values_read,
+                                    SYSTEM_VALUE_HELPER_INVOCATION);
+
+      /* ...but this is more subtle. nir_opt_load_skip_helpers flags texturing
+       * operations that we can skip for bandwidth savings.  We need divergence
+       * info for this, so we run late.
+       *
+       * We may or may not want to force track_helpers on if this makes
+       * progress. Possibly driconf'ing on furmark makes sense.
+       */
+      struct nir_opt_load_skip_helpers_options skip_helpers = {
+         .no_add_divergence = true
+      };
+      JAY_NIR_PASS(nir_opt_load_skip_helpers, &skip_helpers);
+   } else {
      jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
+   }
+
+   /* This must be the very last pass since nir_print itself will reindex! */
+   nj_index_ssa_defs(nir);
   return simd_width;
 }
--- a/src/intel/compiler/jay/jay_opcodes.py
+++ b/src/intel/compiler/jay/jay_opcodes.py
@ -81,7 +81,6 @@ op('bfrev', 1, 'u32', Props.NEGATE)
 op('cbit',  1, 'u32')
 op('cmp',   2, 'u32', Props.NEGATE | Props.CMOD)

-
 # With an 8/16-bit type, `index` specifies the element index of the source
 # within the 32-bit word. For example, if src_type == U16 and index == 1, this
 # converts the upper 16-bits of the input.
@ -134,9 +133,11 @@ op('schedule_barrier', 0, None, Props.NO_DEST)

 for n in ['brd', 'illegal', 'goto', 'join', 'if', 'else',
          'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret',
-          'loop_once', 'halt', 'halt_target']:
+          'loop_once', 'halt_target']:
    op(n, 0, None, Props.NO_DEST)

+op('halt', 0, None, Props.NO_DEST, ['bool predicate_all'])
+
 op('send', 4, None, Props.SIDE_EFFECTS, [
    'gen_sfid sfid',
    'uint8_t sbid',
@ -234,6 +235,15 @@ op('dpas', 3, 'u32', 0, [
    'uint8_t pad[3]',
 ])

+# Initialize helper invocations. Takes 16-bit halves of the dispatch mask.
+op('init_helpers', 2, 'u16', Props.NO_DEST)
+
+# Compare the arguments and demote based on the result.
+op('demote', 2, 'u1 u16 u32 u64 s16 s32 s64 f16 f32 f64', Props.NEGATE | Props.NO_DEST)
+
+# Equivalent to NIR bcsel(@is_helper_invocation, source 0, source 1)
+op('helper_sel', 2, 'u1 u32')
+
 OPCODES = _opcodes

 ENUMS: 'Mapping[str, tuple[str, list[str]]]' = {
--- a/src/intel/compiler/jay/jay_opt_propagate.c
+++ b/src/intel/compiler/jay/jay_opt_propagate.c
@ -3,6 +3,7 @@
 * SPDX-License-Identifier: MIT
 */

+#include "compiler/gen/gen_enums.h"
 #include "util/bitset.h"
 #include "util/lut.h"
 #include "jay_builder.h"
@ -121,6 +122,30 @@ propagate_not(jay_inst *I, unsigned s, jay_inst *mod)
   }
 }

+/**
+ * Fuse demote(cmp(x, y) != 0) to demote(x CMP y).
+ *
+ * Only a demote in the canonical "boolean != 0" form is considered: source 0
+ * must be an SSA value, source 1 must be zero, the type must be U1, and the
+ * conditional modifier must be NE. Anything else is left untouched.
+ *
+ * The instruction defining source 0 is looked up in `defs`, indexed by
+ * jay_index() of that source. If it is an unpredicated CMP, the demote takes
+ * over the CMP's conditional modifier, both of its sources, and its type, so
+ * the demote performs the comparison itself. The CMP is not modified or
+ * removed here.
+ *
+ * NOTE(review): the defs[] entry is dereferenced without a NULL check, so
+ * this assumes every SSA source of a demote has a recorded definition --
+ * confirm against how propagate_forwards() fills defs.
+ */
+static void
+fuse_demote(jay_inst *demote, jay_inst **defs)
+{
+   if (!(jay_is_ssa(demote->src[0]) &&
+         jay_is_zero(demote->src[1]) &&
+         demote->type == JAY_TYPE_U1 &&
+         demote->conditional_mod == GEN_CONDITION_NE)) {
+      return;
+   }
+
+   jay_inst *cmp = defs[jay_index(demote->src[0])];
+   if (cmp->op != JAY_OPCODE_CMP || cmp->predication) {
+      return;
+   }
+
+   demote->conditional_mod = cmp->conditional_mod;
+   demote->src[0] = cmp->src[0];
+   demote->src[1] = cmp->src[1];
+   demote->type = cmp->type;
+}
+
 static void
 propagate_forwards(jay_function *f)
 {
@ -156,6 +181,11 @@ propagate_forwards(jay_function *f)
      if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND)
         continue;

+      /* We fuse demote forwards & upfront to avoid fighting cmod prop */
+      if (I->op == JAY_OPCODE_DEMOTE) {
+         fuse_demote(I, defs);
+      }
+
      jay_foreach_ssa_src(I, s) {
         /* Copy propagate whole vectors */
         jay_def src = I->src[s];
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@ -34,7 +34,8 @@ unsigned jay_process_nir(const struct intel_device_info *devinfo,
                         nir_shader *nir,
                         union brw_any_prog_data *prog_data,
                         union brw_any_prog_key *key,
-                         debug_archiver *archiver);
+                         debug_archiver *archiver,
+                         bool *track_helpers);

 void jay_compute_liveness(jay_function *f);
 void jay_calculate_register_demands(jay_function *f);
@ -84,6 +85,7 @@ void jay_schedule_pressure(jay_shader *s);

 void jay_lower_pre_ra(jay_shader *s);
 void jay_lower_post_ra(jay_shader *s);
+void jay_lower_helpers(jay_shader *s);
 void jay_lower_spill(jay_function *func);
 void jay_lower_simd_width(jay_shader *s);
 void jay_lower_scoreboard(jay_shader *s);
--- a/src/intel/compiler/jay/jay_schedule.c
+++ b/src/intel/compiler/jay/jay_schedule.c
@ -100,9 +100,17 @@ populate_dag(struct sched_ctx *ctx,
         address = ctx->dag.node;
      }

-      /* Serialize side effects for now */
+      /* Serialize side effects for now, including SENDs which need to be
+       * predicated away after a demote.
+       */
      if ((I->op == JAY_OPCODE_SEND && !jay_send_pure(I)) ||
-          I->op == JAY_OPCODE_SCHEDULE_BARRIER) {
+          I->op == JAY_OPCODE_SCHEDULE_BARRIER ||
+          I->op == JAY_OPCODE_INIT_HELPERS ||
+          I->op == JAY_OPCODE_DEMOTE ||
+          I->op == JAY_OPCODE_HELPER_SEL ||
+          (I->op == JAY_OPCODE_SEND &&
+           func->shader->helpers_tracked &&
+           jay_send_skip_helpers(I))) {

         jay_dag_add_edge(&ctx->dag, sidefx);
         sidefx = ctx->dag.node;
--- a/src/intel/compiler/jay/jay_simd_width.c
+++ b/src/intel/compiler/jay/jay_simd_width.c
@ -22,6 +22,7 @@ max_simd_width(const jay_shader *shader, const jay_inst *I)
       I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES ||
       I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
       I->op == JAY_OPCODE_DESWIZZLE_ODD ||
+       I->op == JAY_OPCODE_INIT_HELPERS ||
       I->op == JAY_OPCODE_MUL_32 ||
       I->op == JAY_OPCODE_SHUFFLE ||
       I->op == JAY_OPCODE_ZIP_UGPR16) {
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@ -79,7 +79,7 @@ to_gen_operand(
   gen_operand R;
   unsigned reg = d.reg, count = jay_num_values(d);
   unsigned offset_B = 0, grf = 0;
-   assert(!hi || d.file == GPR);
+   assert(!hi || d.file == GPR || d.file == FLAG);

   if (count && (d.file == GPR || d.file == UGPR)) {
      struct jay_register_block block =
@ -189,7 +189,8 @@ to_gen_operand(
       * SIMD1 instructions and are never SIMD split.
       */
      assert(simd_offs == 0 || idx >= 0);
-      unsigned offs_B = d.reg * (f->shader->dispatch_width / 8);
+      unsigned offs_B =
+         (d.reg * (f->shader->dispatch_width / 8)) + (hi ? 2 : 0);
      R = gen_flag(offs_B / 2);
   } else if (d.file == J_ADDRESS) {
      R = gen_address(d.reg);
@ -580,6 +581,14 @@ emit(struct jay_codegen *jc,
      }
      break;

+   case JAY_OPCODE_HALT:
+      if (jay_halt_predicate_all(I)) {
+         assert(I->predication);
+         gen->pred_control =
+            jc->devinfo->ver >= 20 ? GEN_PREDICATE_XE2_ALL : GEN_PREDICATE_ALLV;
+      }
+      break;
+
   case JAY_OPCODE_HALT_TARGET:
      /* HALT temporarily disables channels, and the same instruction is used
       * to re-enable them: once all channels are disabled, then they are
--- a/src/intel/compiler/jay/jay_validate.c
+++ b/src/intel/compiler/jay/jay_validate.c
@ -33,7 +33,9 @@ block_state_for_inst(jay_inst *I)
   if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) {
      return STATE_PHI_DST;
   } else if (I->op == JAY_OPCODE_PHI_SRC ||
-              (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) {
+              (jay_op_is_control_flow(I->op) &&
+               I->op != JAY_OPCODE_ELSE &&
+               I->op != JAY_OPCODE_HALT_TARGET)) {
      return STATE_LATE;
   } else {
      return STATE_NORMAL;
@ -238,10 +240,6 @@ validate_inst(struct validate_state *validate, jay_inst *I)
   validate_flagness(validate, I->dst, I->type, "destination");
   validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag");

-   CHECK(!I->conditional_mod ||
-         !jay_is_null(I->cond_flag) ||
-         I->op == JAY_OPCODE_CSEL);
-
   /* These assumptions are baked into the definition of broadcast_flag and
    * required to ensure correctness with the lane masking.
    */
@ -256,8 +254,10 @@ validate_inst(struct validate_state *validate, jay_inst *I)
   CHECK(I->cond_flag.file != FLAG || I->dst.file != UGPR);

   /* Standard modifiers only allowed on some instructions */
-   CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL);
   CHECK(!I->saturate || opinfo->sat);
+   CHECK(!I->conditional_mod ||
+         (I->op == JAY_OPCODE_CSEL || I->op == JAY_OPCODE_DEMOTE) ||
+         (!jay_is_null(I->cond_flag) && opinfo->cmod));

   unsigned num_srcs = I->num_srcs;

--- a/src/intel/compiler/jay/meson.build
+++ b/src/intel/compiler/jay/meson.build
@ -54,6 +54,7 @@ libintel_compiler_jay_files = files(
  'jay_ir.h',
  'jay_insert_fp_mode.c',
  'jay_liveness.c',
+  'jay_lower_helpers.c',
  'jay_lower_post_ra.c',
  'jay_lower_pre_ra.c',
  'jay_lower_scoreboard.c',