From 9cc686ac720c05d935074a4e54f76eca3dc0c83c Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Date: Thu, 11 Jun 2026 12:47:46 -0400
Subject: [PATCH] jay: rewrite demote/terminate/helper/halt handling

* implement terminate
* fix HALT brokenness on all shader stages (we need a real end block)
* optimize demote codegen a ton
* optimize gl_HelperInvocation/gl_SampleMask
* optimize "all lanes demoted" via HALT.any
* optimize scheduling of stores/atomics/demotes in FS
* optimize some texturing with helper invocations

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42097>
---
 src/compiler/nir/nir_divergence_analysis.c |   1 -
 src/compiler/nir/nir_intrinsics.py         |   7 +-
 src/intel/compiler/jay/jay_assign_flags.c  |  19 +-
 src/intel/compiler/jay/jay_from_nir.c      | 102 ++++++-----
 src/intel/compiler/jay/jay_ir.h            |  31 +++-
 src/intel/compiler/jay/jay_lower_helpers.c | 191 +++++++++++++++++++++
 src/intel/compiler/jay/jay_lower_pre_ra.c  |   2 +-
 src/intel/compiler/jay/jay_nir.c           | 143 ++++-----------
 src/intel/compiler/jay/jay_opcodes.py      |  14 +-
 src/intel/compiler/jay/jay_opt_propagate.c |  30 ++++
 src/intel/compiler/jay/jay_private.h       |   4 +-
 src/intel/compiler/jay/jay_schedule.c      |  12 +-
 src/intel/compiler/jay/jay_simd_width.c    |   1 +
 src/intel/compiler/jay/jay_to_binary.c     |  13 +-
 src/intel/compiler/jay/jay_validate.c      |  12 +-
 src/intel/compiler/jay/meson.build         |   1 +
 16 files changed, 405 insertions(+), 178 deletions(-)
 create mode 100644 src/intel/compiler/jay/jay_lower_helpers.c

diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index ba29f2f041a..052dd3187f8 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -1039,7 +1039,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_dpas_intel:
    case nir_intrinsic_convert_cmat_intel:
    case nir_intrinsic_load_coverage_mask_intel:
-   case nir_intrinsic_load_dispatch_mask_intel:
    case nir_intrinsic_isberd_nv:
    case nir_intrinsic_isbewr_nv:
    case nir_intrinsic_vild_nv:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 33b90a5dfbc..afbce7a24a2 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2686,9 +2686,6 @@ system_value("simd_width_intel", 1)
 # IndirectDataStartAddress
 system_value("indirect_address_intel", 1)
 
-# The dispatch mask as provided in the FS payload.
-system_value("dispatch_mask_intel", 1)
-
 # The raw coverage mask as provided in the FS payload.
 # The semantics of it depend on the HW state.
 system_value("coverage_mask_intel", 1)
@@ -2704,8 +2701,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
           indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Write a render target
-# src[] = { color, dual_color, src0_alpha, omask, depth, stencil, predicate }
-intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32, 1])
+# src[] = { color, dual_color, src0_alpha, omask, depth, stencil }
+intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32])
 
 # Shuffle with an offset in bytes instead of a lane index.
 # src[] = { payload, lane offset in bytes }
diff --git a/src/intel/compiler/jay/jay_assign_flags.c b/src/intel/compiler/jay/jay_assign_flags.c
index 35b20d310f8..67180ea3af1 100644
--- a/src/intel/compiler/jay/jay_assign_flags.c
+++ b/src/intel/compiler/jay/jay_assign_flags.c
@@ -72,10 +72,15 @@ assign_flag(struct flag_ra *ra,
    jay_def tmp = jay_alloc_def(ra->b, file, 1);
 
    unsigned num_flags = jay_num_regs(ra->b->shader, FLAG);
+   if (ra->b->shader->helpers_tracked) {
+      /* Helper tracking uses the last flag by definition */
+      num_flags--;
+   }
+
    tmp.reg = tie ? tie->reg : ballot ? 0 : ((ra->roundrobin++) % num_flags);
 
    /* Uniform access (via a UFLAG or an inverse-ballot) would clobber the zero
-    * for a ballot. We could refine this further but this should be ok for now.
+    * for a ballot. TODO: This needs to be reworked to get the flag back.
     */
    if (!ballot &&
        tmp.reg == 0 &&
@@ -84,6 +89,8 @@ assign_flag(struct flag_ra *ra,
       assert(!tie);
       tmp.reg = 1;
       ra->roundrobin++;
+
+      assert(num_flags >= 2); /* XXX: Not always true, FIXME */
    }
 
    if (jay_index(canonical) < ra->nr_vars) {
@@ -193,6 +200,16 @@ assign_block(struct flag_ra *ra)
          I->type = JAY_TYPE_U32;
          I->dst = canonicalize_flag(I->dst);
          continue;
+      } else if (I->op == JAY_OPCODE_SEND &&
+                 jay_send_skip_helpers(I) &&
+                 jay_is_no_mask(I)) {
+
+         /* jay_lower_helpers will clobber flag 0 to handle this case, see the
+          * logic there. Evict whatever was there.
+          */
+         ra->flag_to_global[0] = 0;
+         assert(!I->predication);
+         continue;
       } else if (I->type == JAY_TYPE_U1) {
          /* Boolean logic turns into bitwise logic on the canonical form */
          if (!jay_is_null(I->dst)) {
diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c
index 67a61f54bad..cb3dbe1fae1 100644
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@@ -8,12 +8,14 @@
 #include "compiler/brw/brw_eu_defines.h"
 #include "compiler/brw/brw_nir.h"
 #include "compiler/brw/brw_sampler.h"
+#include "compiler/gen/gen_enums.h"
 #include "compiler/intel_nir.h"
 #include "compiler/intel_shader_enums.h"
 #include "compiler/list.h"
 #include "intel/dev/intel_debug.h"
 #include "mda/debug_archiver.h"
 #include "util/bitscan.h"
+#include "util/bitset.h"
 #include "util/lut.h"
 #include "util/macros.h"
 #include "util/u_math.h"
@@ -81,10 +83,7 @@ struct nir_to_jay_state {
    const struct intel_device_info *devinfo;
 
    jay_builder bld;
-
-   jay_block *current_block;
-   jay_block *after_block;
-   jay_block *break_block;
+   jay_block *current_block, *after_block, *break_block, *exit_block;
 
    unsigned indent;
    bool needs_final_halt;
@@ -832,19 +831,6 @@ scalars_equal(nir_scalar a, nir_scalar b)
            nir_scalar_as_uint(a) == nir_scalar_as_uint(b));
 }
 
-static void
-jay_emit_halt_target(struct nir_to_jay_state *nj)
-{
-   /* This final halt will re-enable the channels which got masked off by first
-    * HALT.
-    */
-   if (nj->needs_final_halt) {
-      /* This avoids re-emitting the halt after EOT send */
-      nj->needs_final_halt = false;
-      jay_HALT_TARGET(&nj->bld);
-   }
-}
-
 static void
 jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
 {
@@ -860,8 +846,6 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
    const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0);
    const bool last = !nir_instr_next(&intr->instr);
 
-   jay_emit_halt_target(nj);
-
    /* The hardware freaks out if we give it an omask without multisampling. */
    if (!b->shader->prog_data->fs.uses_omask) {
       omask = jay_null();
@@ -941,15 +925,10 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
          srcs[len++] = jay_extract(packed, i);
    }
 
-   jay_inst *send =
-      jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
-               .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
-               .type = JAY_TYPE_U32, .eot = last, .split = split);
-
-   /* Handle the disable predicate. It is logically inverted. */
-   if (!nir_src_is_zero(intr->src[6])) {
-      jay_add_predicate(b, send, jay_negate(nj_src(intr->src[6])));
-   }
+   jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
+            .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
+            .type = JAY_TYPE_U32, .eot = last, .split = split,
+            .skip_helpers = true);
 }
 
 static enum lsc_data_size
@@ -1572,19 +1551,6 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
       jay_MOV(b, dst, fs->coverage_mask);
       break;
 
-   case nir_intrinsic_load_dispatch_mask_intel: {
-      jay_def mask = jay_extract(nj->payload.u0, 15);
-
-      if (nj->s->dispatch_width == 32) {
-         /* TODO: Optimize */
-         jay_def hi = jay_extract(nj->payload.u1, 15);
-         mask = jay_BFI2_u32(b, 0xffff0000, hi, mask);
-      }
-
-      jay_MOV(b, dst, mask);
-      break;
-   }
-
    case nir_intrinsic_load_subgroup_invocation: {
       jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
       jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
@@ -1600,8 +1566,16 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
    }
 
    case nir_intrinsic_demote:
+      jay_DEMOTE_u32(b, jay_null(), jay_null());
+      break;
    case nir_intrinsic_demote_if:
-      /* TODO: Already lowered, but need to implement for performance. */
+      jay_DEMOTE(b, JAY_TYPE_U1, nj_src(intr->src[0]), 0)->conditional_mod =
+         GEN_CONDITION_NE;
+      break;
+
+   case nir_intrinsic_load_helper_invocation:
+   case nir_intrinsic_is_helper_invocation:
+      jay_HELPER_SEL(b, dst, 1, 0);
       break;
 
    case nir_intrinsic_ddx:
@@ -2455,7 +2429,8 @@ jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
             .ex_desc = desc_ex_src, .header = header, .srcs = payload,
             .nr_srcs = n_sources, .type = JAY_TYPE_U32,
             .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform,
-            .bindless = surface_bindless, .pure = true);
+            .bindless = surface_bindless, .pure = true,
+            .skip_helpers = tex->skip_helpers);
 
    /* If we sampled into a temporary, copy out to the final */
    if (residency) {
@@ -2484,7 +2459,8 @@ jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr)
       break;
    case nir_jump_halt:
       nj->needs_final_halt = true;
-      jay_HALT(&nj->bld);
+      jay_block_add_successor(nj->current_block, nj->exit_block, GPR);
+      jay_HALT(&nj->bld, false);
       break;
    case nir_jump_return:
       /* Should be lowered */
@@ -2754,8 +2730,16 @@ static void
 jay_emit_eot(struct nir_to_jay_state *nj)
 {
    jay_builder *b = &nj->bld;
+   b->cursor = jay_after_block(nj->exit_block);
 
-   jay_emit_halt_target(nj);
+   /* Jump target for HALT */
+   if (nj->needs_final_halt) {
+      if (nj->s->stage == MESA_SHADER_FRAGMENT) {
+         assert(nj->s->helpers_tracked);
+      } else {
+         jay_HALT_TARGET(&nj->bld);
+      }
+   }
 
    if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
       jay_def u0 = nj->payload.u0;
@@ -2773,12 +2757,18 @@ jay_emit_eot(struct nir_to_jay_state *nj)
                .uniform = true);
    } else if (nj->nir->info.stage == MESA_SHADER_VERTEX ||
               nj->nir->info.stage == MESA_SHADER_TESS_EVAL) {
-      jay_block *block = jay_last_block(nj->f);
+      jay_block *block = jay_last_source_block(nj->f);
       jay_inst *I = jay_last_inst(block);
 
+      assert(!nj->needs_final_halt && "halt not supported with URB");
+
       /* TODO: What if this isn't the case? Do we need a no-op store...? */
       assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == GEN_SFID_URB);
+
+      /* Pluck out the final SEND and put it in the exit block */
       jay_set_send_eot(I, true);
+      jay_remove_instruction(I);
+      jay_builder_insert(b, I);
    }
 }
 
@@ -3012,6 +3002,14 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
       }
    }
 
+   /* INIT_HELPERS reads UGPRs but has no SSA write. Therefore to minimize
+    * pressure, we want to hoist it as much as possible.
+    */
+   if (nj->s->helpers_tracked) {
+      jay_INIT_HELPERS(&nj->bld, jay_extract(nj->payload.u0, 15),
+                       payload_u1(nj, 15, 1));
+   }
+
    for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
       if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) {
          *(split_gprs[i].def) =
@@ -3178,7 +3176,11 @@ jay_from_nir_function(const struct intel_device_info *devinfo,
       jay_setup_payload(&nj);
    }
 
+   nj.exit_block = jay_create_block(&nj);
    jay_emit_cf_list(&nj, &impl->body);
+   jay_block_add_successor(nj.current_block, nj.exit_block, GPR);
+
+   list_addtail(&nj.exit_block->link, &f->blocks);
    jay_emit_eot(&nj);
    jay_remove_unreachable_blocks(f);
 }
@@ -3216,8 +3218,9 @@ jay_compile(const struct intel_device_info *devinfo,
       INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) &&
       !(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL));
 
+   bool track_helpers = false;
    unsigned simd_width =
-      jay_process_nir(devinfo, nir, prog_data, key, archiver);
+      jay_process_nir(devinfo, nir, prog_data, key, archiver, &track_helpers);
 
    if (debug) {
       /* We can't use nir_print_shader since it reindexes SSA defs. */
@@ -3232,6 +3235,7 @@ jay_compile(const struct intel_device_info *devinfo,
    s->devinfo = devinfo;
    s->prog_data = prog_data;
    s->archiver = archiver;
+   s->helpers_tracked = track_helpers;
 
    nir_foreach_function_impl(impl, nir) {
       jay_from_nir_function(devinfo, nir, s, impl);
@@ -3288,6 +3292,10 @@ jay_compile(const struct intel_device_info *devinfo,
       JAY_PASS(s, jay_insert_payload_swizzle);
    }
 
+   if (s->stage == MESA_SHADER_FRAGMENT && s->helpers_tracked) {
+      JAY_PASS(s, jay_lower_helpers);
+   }
+
    if (!(jay_debug & JAY_DBG_NOOPT)) {
       /* jay_assign_accumulators uses a conservative liveness analysis for
        * predication, so assign accumulators before predicating for better
diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h
index 624fcb20358..f939f7bd771 100644
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@@ -787,6 +787,12 @@ typedef struct jay_shader {
    unsigned scratch_size;
    unsigned payload_gprs, payload_ugprs, push_grfs;
 
+   /**
+    * In a fragment shader, whether a helper invocation flag is tracked. Flag RA
+    * must reserve the relevant flag.
+    */
+   bool helpers_tracked;
+
    /**
     * Ralloc linear context. Since we don't typically free as we go,
     * most allocations should go through this context for efficiency.
@@ -1126,7 +1132,7 @@ jay_new_block(jay_function *f)
 static inline bool
 jay_op_is_control_flow(enum jay_opcode op)
 {
-   return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE;
+   return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_HALT;
 }
 
 /**
@@ -1201,6 +1207,9 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
 #define jay_foreach_block_rev(f, v)                                            \
    list_for_each_entry_rev(jay_block, v, &f->blocks, link)
 
+#define jay_foreach_block_safe_rev(f, v)                                       \
+   list_for_each_entry_safe_rev(jay_block, v, &f->blocks, link)
+
 #define jay_foreach_block_from(f, from, v)                                     \
    list_for_each_entry_from(jay_block, v, from, &f->blocks, link)
 
@@ -1238,7 +1247,7 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
       jay_foreach_inst_in_block_safe(block, v)
 
 #define jay_foreach_inst_in_func_safe_rev(func, block, v)                      \
-   jay_foreach_block_rev(func, block)                                          \
+   jay_foreach_block_safe_rev(func, block)                                     \
       jay_foreach_inst_in_block_safe_rev(block, v)
 
 #define jay_foreach_inst_in_shader(s, func, inst)                              \
@@ -1355,6 +1364,15 @@ jay_last_block(jay_function *f)
       return list_last_entry(&f->blocks, jay_block, link);
 }
 
+static inline jay_block *
+jay_last_source_block(jay_function *f)
+{
+   /* The block just before the final one; NULL if there are under 2 blocks */
+   bool too_few = list_is_empty(&f->blocks) || list_is_singular(&f->blocks);
+   return too_few ? NULL :
+                    list_last_entry(&jay_last_block(f)->link, jay_block, link);
+}
+
 static inline jay_inst *
 jay_last_inst(jay_block *block)
 {
@@ -1373,11 +1391,14 @@ jay_next_block(jay_block *block)
 static inline void
 jay_block_add_successor(jay_block *block, jay_block *succ, enum jay_file file)
 {
+   /* Prune duplicate successors so the caller doesn't need to worry */
    jay_block **succs = jay_successors(block, file);
-   unsigned i = succs[0] ? 1 : 0;
+   if (succs[0] == succ || succs[1] == succ) {
+      return;
+   }
 
-   assert(succ && succs[0] != succ && succs[1] != succ);
-   assert(succs[i] == NULL && "at most 2 successors");
+   unsigned i = succs[0] ? 1 : 0;
+   assert(succ && succs[i] == NULL && "at most 2 successors");
 
    succs[i] = succ;
    util_dynarray_append(jay_predecessors(succ, file), block);
diff --git a/src/intel/compiler/jay/jay_lower_helpers.c b/src/intel/compiler/jay/jay_lower_helpers.c
new file mode 100644
index 00000000000..c8423dcc1aa
--- /dev/null
+++ b/src/intel/compiler/jay/jay_lower_helpers.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "compiler/gen/gen_enums.h"
+#include "util/list.h"
+#include "util/u_dynarray.h"
+#include "jay_builder.h"
+#include "jay_builder_opcodes.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+struct ctx {
+   jay_block *last_source_block; /* NOTE(review): unused in this file */
+   jay_def helper_flag;          /* flag register that tracks helper lanes */
+   bool halted, uses_terminate;  /* any HALT emitted / HALT op encountered */
+   unsigned instr_left;          /* instructions visited so far, walking back */
+};
+
+/*
+ * Takes src, a linked list containing the element pivot in the middle, and dst
+ * an empty list. Moves all elements up to and including pivot from src to dst,
+ * leaving the rest in src. Semantically equivalent to a loop of list_move but
+ * O(1) time regardless of the position of pivot in the list.
+ */
+static void
+list_partition(struct list_head *src,
+               struct list_head *dst,
+               struct list_head *pivot)
+{
+   /* dst runs from src[0] to pivot */
+   dst->next = src->next;
+   dst->prev = pivot;
+   dst->next->prev = dst;
+
+   /* src runs from pivot[1:] to end of src. Its tail link is untouched unless
+    * pivot was the tail: then src->next == src and the next store empties src.
+    */
+   src->next = pivot->next;
+   src->next->prev = src;
+   pivot->next = dst;
+
+   list_validate(dst);
+   list_validate(src);
+}
+
+static void
+process_block(struct ctx *ctx, jay_builder *b, jay_block *block)
+{
+   jay_foreach_inst_in_block_safe_rev(block, I) {
+      b->cursor = jay_before_inst(I);
+
+      if (I->op == JAY_OPCODE_INIT_HELPERS) {
+         jay_NOT(b, ctx->helper_flag, I->src[0])->type = JAY_TYPE_U16;
+
+         if (!jay_is_null(I->src[1])) {
+            jay_def hi = ctx->helper_flag;
+            hi.hi = true;
+            jay_NOT(b, hi, I->src[1])->type = JAY_TYPE_U16;
+         }
+
+         jay_remove_instruction(I);
+      } else if (I->op == JAY_OPCODE_HALT) {
+         ctx->halted = ctx->uses_terminate = true; /* exit needs HALT_TARGET */
+      } else if (I->op == JAY_OPCODE_DEMOTE) {
+         enum gen_condition cond = I->conditional_mod;
+         jay_def x = I->src[0], y = I->src[1];
+
+         /* Unconditional discard: compare a register to itself for EQ */
+         if (!cond) {
+            cond = GEN_CONDITION_EQ;
+            I->type = JAY_TYPE_U32;
+            x = y = jay_bare_reg(UGPR, 0);
+         }
+
+         jay_inst *cmp = jay_CMP(b, I->type, cond, ctx->helper_flag, x, y);
+         jay_add_predicate(b, cmp, jay_negate(ctx->helper_flag));
+         jay_remove_instruction(I);
+
+         /* We are allowed to halt after a demote if all lanes are inactive
+          * for performance, but it's not required for correctness. Only do
+          * it if it's likely profitable.
+          *
+          * We assume a shader either uses SPIR-V demote or terminate, but
+          * not both. If the shader uses terminate, there will be an actual
+          * HALT instruction after us so we don't bother with a second HALT
+          * here. Strictly there's a corner case here if all non-helpers are
+          * terminated but lanes spawned as helpers are not terminated, but
+          * this is probably reasonable as a tradeoff.
+          */
+         if (ctx->instr_left > 6 && !ctx->uses_terminate) {
+            jay_inst *halt = jay_HALT(b, true);
+            halt = jay_add_predicate(b, halt, ctx->helper_flag);
+            ctx->halted = true;
+
+            jay_block *split = jay_new_block(b->func);
+            split->indent = block->indent;
+
+            list_partition(&block->instructions, &split->instructions,
+                           &halt->link);
+            list_addtail(&split->link, &block->link);
+
+            /* Predecessors of block now branch to split instead */
+            for (unsigned file = GPR; file <= UGPR; ++file) {
+               jay_foreach_predecessor(block, pred, file) {
+                  jay_block **succs = jay_successors(*pred, file);
+                  unsigned idx = succs[0] == block ? 0 : 1;
+                  succs[idx] = split;
+               }
+            }
+            typed_memcpy(&split->physical_preds, &block->physical_preds, 1);
+            typed_memcpy(&split->logical_preds, &block->logical_preds, 1);
+            util_dynarray_init(&block->physical_preds, block);
+            util_dynarray_init(&block->logical_preds, block);
+
+            jay_block_add_successor(split, block, GPR);
+            jay_block_add_successor(split, jay_last_block(b->func), GPR);
+            return; /* code before the HALT now lives in split */
+         }
+      } else if (I->op == JAY_OPCODE_HELPER_SEL) {
+         jay_SEL(b, JAY_TYPE_U32, I->dst, I->src[0], I->src[1],
+                 ctx->helper_flag);
+         jay_remove_instruction(I);
+      } else if (I->op == JAY_OPCODE_SEND && jay_send_skip_helpers(I)) {
+         if (jay_is_no_mask(I)) {
+            /* jay_assign_flags ensured this is free for us, see logic there */
+            jay_def t = jay_bare_reg(UFLAG, 0);
+            jay_inst *not = jay_NOT(b, jay_null(), ctx->helper_flag);
+            not->type = JAY_TYPE_U | b->shader->dispatch_width;
+            jay_set_conditional_mod(b, not, t, GEN_CONDITION_NE);
+            jay_add_predicate(b, I, t);
+         } else {
+            jay_add_predicate(b, I, jay_negate(ctx->helper_flag));
+         }
+      }
+
+      ++ctx->instr_left; /* instructions seen so far in the backwards walk */
+   }
+}
+
+void
+jay_lower_helpers(jay_shader *shader)
+{
+   jay_function *entrypoint = jay_shader_get_entrypoint(shader);
+   jay_block *exit_block = jay_last_block(entrypoint);
+   jay_block *last_source_block = jay_last_source_block(entrypoint);
+
+   /* By ABI with jay_assign_flags, the last flag is used to track helpers */
+   assert(shader->helpers_tracked);
+   unsigned helper_flag_no = jay_num_regs(shader, FLAG) - 1;
+   struct ctx ctx = { .helper_flag = jay_bare_reg(FLAG, helper_flag_no) };
+   jay_builder b = jay_init_builder(entrypoint, jay_after_block(exit_block));
+
+   jay_foreach_block_rev(entrypoint, block) { /* walks last block first */
+      process_block(&ctx, &b, block);
+   }
+
+   /* Fill out the exit block: the HALT jump target, then the EOT send */
+   b.cursor = jay_after_block(exit_block);
+   if (ctx.halted) {
+      jay_HALT_TARGET(&b);
+   }
+
+   /* Try to pluck out the last instruction and use it for EOT. This breaks SSA
+    * dominance invariants but that's why this is a post-RA, post-sched pass.
+    * Only SWSB has to deal with the resulting mess.
+    *
+    * There may be no such send (in case of an unconditional terminate). In that
+    * case, insert a predicated-out null RT write to use for EOT.
+    */
+   jay_inst *send = jay_last_inst(last_source_block);
+   if (send && send->op == JAY_OPCODE_SEND && jay_send_eot(send)) {
+      jay_remove_instruction(send);
+      jay_builder_insert(&b, send);
+   } else {
+      jay_def dummy = jay_bare_reg(GPR, 0);
+      dummy.num_values_m1 = 4 - 1;
+
+      unsigned op = shader->dispatch_width == 32 ?
+                       XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
+                       BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+      uint64_t desc = brw_fb_write_desc(shader->devinfo, 0, op, true, false);
+      uint64_t ex_desc = (1 << 20) /* null rt */;
+
+      send = jay_SEND(&b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
+                      .msg_desc = desc | (ex_desc << 32), .nr_srcs = 1,
+                      .srcs = &dummy, .type = JAY_TYPE_U32, .eot = true);
+      send = jay_add_predicate(&b, send, jay_negate(ctx.helper_flag));
+   }
+}
diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c
index dae14383201..f2771a85305 100644
--- a/src/intel/compiler/jay/jay_lower_pre_ra.c
+++ b/src/intel/compiler/jay/jay_lower_pre_ra.c
@@ -96,7 +96,7 @@ try_swap_src01(jay_inst *I)
    if (I->op == JAY_OPCODE_SEL) {
       /* sel(a, b, p) = sel(b, a, !p) */
       I->src[2].negate ^= true;
-   } else if (I->op == JAY_OPCODE_CMP) {
+   } else if (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_DEMOTE) {
       I->conditional_mod = gen_condition_swap_sources(I->conditional_mod);
    } else if (I->op == JAY_OPCODE_BFN) {
       jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1));
diff --git a/src/intel/compiler/jay/jay_nir.c b/src/intel/compiler/jay/jay_nir.c
index 1e8753bce78..3c23cd86849 100644
--- a/src/intel/compiler/jay/jay_nir.c
+++ b/src/intel/compiler/jay/jay_nir.c
@@ -46,21 +46,6 @@ nj_index_ssa_defs(nir_shader *nir)
    }
 }
 
-static bool
-lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
-{
-   if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
-      return false;
-
-   /* TODO: Is this right for multisampling? */
-   b->cursor = nir_before_instr(&intr->instr);
-   nir_def *active =
-      nir_inot(b, nir_inverse_ballot(b, nir_load_dispatch_mask_intel(b)));
-
-   nir_def_replace(&intr->def, active);
-   return true;
-}
-
 static bool
 lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
 {
@@ -178,8 +163,7 @@ insert_rt_store(nir_builder *b,
                 nir_def *src0_colour,
                 nir_def *depth,
                 nir_def *stencil,
-                nir_def *sample_mask,
-                nir_def *disable)
+                nir_def *sample_mask)
 {
    bool null_rt = target < 0;
 
@@ -197,8 +181,7 @@ insert_rt_store(nir_builder *b,
    nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3);
 
    nir_store_render_target_intel(b, colour, dual_colour, src0_alpha,
-                                 sample_mask, depth, stencil, disable,
-                                 .target = target);
+                                 sample_mask, depth, stencil, .target = target);
 }
 
 static void
@@ -216,14 +199,10 @@ lower_fragment_outputs(nir_function_impl *impl,
 
    nir_def *undef = nir_undef(b, 1, 32);
 
-   nir_def *disable = b->shader->info.fs.uses_discard ?
-                         nir_is_helper_invocation(b, 1) :
-                         nir_imm_false(b);
-
    if (ctx.dual_blend) {
       insert_rt_store(b, 0, ctx.colour[0], ctx.colour[1], NULL,
                       ctx.depth ?: undef, ctx.stencil ?: undef,
-                      ctx.sample_mask ?: undef, disable);
+                      ctx.sample_mask ?: undef);
       return;
    }
 
@@ -239,83 +218,13 @@ lower_fragment_outputs(nir_function_impl *impl,
       if (ctx.colour[i]) {
          insert_rt_store(b, i, ctx.colour[i], NULL,
                          i > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
-                         ctx.stencil ?: undef, ctx.sample_mask ?: undef,
-                         disable);
+                         ctx.stencil ?: undef, ctx.sample_mask ?: undef);
       }
    }
 
    insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL, NULL,
                    last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
-                   ctx.stencil ?: undef, ctx.sample_mask ?: undef, disable);
-}
-
-/**
- * Drop render target stores with unconditional discards.
- */
-static bool
-opt_unconditional_discards(nir_shader *nir)
-{
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   nir_block *block = nir_impl_last_block(impl);
-
-   bool progress = false;
-   bool any_remaining_rt_writes = false;
-
-   nir_foreach_instr_reverse_safe(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-      if (intr->intrinsic == nir_intrinsic_store_render_target_intel) {
-         nir_scalar discard = nir_scalar_resolved(intr->src[6].ssa, 0);
-         if (nir_scalar_is_const(discard) && nir_scalar_as_bool(discard)) {
-            /* Drop store with unconditional discard */
-            nir_instr_remove(instr);
-            progress = true;
-         } else {
-            /* This RT store might actually happen */
-            any_remaining_rt_writes = true;
-         }
-      } else if ((intr->intrinsic == nir_intrinsic_demote ||
-                  intr->intrinsic == nir_intrinsic_terminate) &&
-                 !any_remaining_rt_writes) {
-         /* Delete unconditional demotes/terminates in the end block... */
-         nir_instr_remove(instr);
-         progress = true;
-      } else {
-         /* ...but stop if we find an intrinsic that has a side-effect */
-         const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-         if (!(info->flags & NIR_INTRINSIC_CAN_ELIMINATE))
-            break;
-      }
-   }
-
-   /* See if discards still exist in the program and flag accordingly */
-   nir->info.fs.uses_discard = false;
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type == nir_instr_type_intrinsic) {
-            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-            if (intr->intrinsic == nir_intrinsic_demote ||
-                intr->intrinsic == nir_intrinsic_demote_if ||
-                intr->intrinsic == nir_intrinsic_terminate ||
-                intr->intrinsic == nir_intrinsic_terminate_if)
-               nir->info.fs.uses_discard = true;
-         }
-      }
-   }
-
-   /* If we eliminated all RT stores, add a Null RT store to end the thread. */
-   if (!any_remaining_rt_writes) {
-      nir_builder b = nir_builder_at(nir_after_impl(impl));
-      nir_def *undef = nir_undef(&b, 1, 32);
-      insert_rt_store(&b, -1, NULL, NULL, NULL, undef, undef, undef,
-                      nir_imm_true(&b));
-   }
-
-   return nir_progress(progress, impl, nir_metadata_control_flow);
+                   ctx.stencil ?: undef, ctx.sample_mask ?: undef);
 }
 
 unsigned
@@ -323,7 +232,8 @@ jay_process_nir(const struct intel_device_info *devinfo,
                 nir_shader *nir,
                 union brw_any_prog_data *prog_data,
                 union brw_any_prog_key *key,
-                debug_archiver *archiver)
+                debug_archiver *archiver,
+                bool *track_helpers)
 {
    enum mesa_shader_stage stage = nir->info.stage;
    struct brw_compiler compiler = { .devinfo = devinfo };
@@ -475,10 +385,14 @@ jay_process_nir(const struct intel_device_info *devinfo,
 
       lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
                              key->fs.nr_color_regions, simd_width);
-      JAY_NIR_PASS(nir_lower_helper_writes, true);
-      JAY_NIR_PASS(nir_lower_is_helper_invocation);
-      JAY_NIR_PASS(nir_shader_intrinsics_pass, lower_helper_invocation,
-                   nir_metadata_control_flow, NULL);
+
+      /* nir_lower_terminate_to_demote will hamper our ability to schedule
+       * terminates (since it turns them into real control flow), so run
+       * nir_opt_move_discards_to_top first as a prepass. That should help
+       * scheduling demotes too (which is more important).
+       */
+      JAY_NIR_PASS(nir_opt_move_discards_to_top);
+      JAY_NIR_PASS(nir_lower_terminate_to_demote);
 
       if (key->fs.alpha_to_coverage != INTEL_NEVER) {
          /* Run constant fold optimization in order to get the correct source
@@ -495,8 +409,6 @@ jay_process_nir(const struct intel_device_info *devinfo,
        */
       brw_nir_optimize(pt);
 
-      NIR_PASS(_, nir, opt_unconditional_discards);
-
       // TODO
       // JAY_NIR_PASS(brw_nir_move_interpolation_to_top);
 
@@ -556,10 +468,31 @@ jay_process_nir(const struct intel_device_info *devinfo,
 
    /* Run divergence analysis at the end */
    nir_sweep(nir);
-   nj_index_ssa_defs(nir);
    nir_divergence_analysis(nir);
 
-   if (stage != MESA_SHADER_FRAGMENT)
+   if (stage == MESA_SHADER_FRAGMENT) {
+      /* Certain features require tracking helpers for correctness */
+      nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+      *track_helpers |= nir->info.fs.uses_discard || nir->info.writes_memory;
+      *track_helpers |= BITSET_TEST(nir->info.system_values_read,
+                                    SYSTEM_VALUE_HELPER_INVOCATION);
+
+      /* ...but this is more subtle. nir_opt_load_skip_helpers flags texturing
+       * operations that we can skip for bandwidth savings.  We need divergence
+       * info for this, so we run late.
+       *
+       * We may or may not want to force track_helpers on if this makes
+       * progress. Possibly driconf'ing on furmark makes sense.
+       */
+      struct nir_opt_load_skip_helpers_options skip_helpers = {
+         .no_add_divergence = true
+      };
+      JAY_NIR_PASS(nir_opt_load_skip_helpers, &skip_helpers);
+   } else {
       jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
+   }
+
+   /* This must be the very last pass since nir_print itself will reindex! */
+   nj_index_ssa_defs(nir);
    return simd_width;
 }
diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py
index 2d19591fe74..c09adbe9b31 100644
--- a/src/intel/compiler/jay/jay_opcodes.py
+++ b/src/intel/compiler/jay/jay_opcodes.py
@@ -81,7 +81,6 @@ op('bfrev', 1, 'u32', Props.NEGATE)
 op('cbit',  1, 'u32')
 op('cmp',   2, 'u32', Props.NEGATE | Props.CMOD)
 
-
 # With an 8/16-bit type, `index` specifies the element index of the source
 # within the 32-bit word. For example, if src_type == U16 and index == 1, this
 # converts the upper 16-bits of the input.
@@ -134,9 +133,11 @@ op('schedule_barrier', 0, None, Props.NO_DEST)
 
 for n in ['brd', 'illegal', 'goto', 'join', 'if', 'else',
           'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret',
-          'loop_once', 'halt', 'halt_target']:
+          'loop_once', 'halt_target']:
     op(n, 0, None, Props.NO_DEST)
 
+op('halt', 0, None, Props.NO_DEST, ['bool predicate_all'])
+
 op('send', 4, None, Props.SIDE_EFFECTS, [
     'gen_sfid sfid',
     'uint8_t sbid',
@@ -234,6 +235,15 @@ op('dpas', 3, 'u32', 0, [
     'uint8_t pad[3]',
 ])
 
+# Initialize helper invocations. Takes 16-bit halves of the dispatch mask.
+op('init_helpers', 2, 'u16', Props.NO_DEST)
+
+# Compare the arguments and demote based on the result.
+op('demote', 2, 'u1 u16 u32 u64 s16 s32 s64 f16 f32 f64', Props.NEGATE | Props.NO_DEST)
+
+# Equivalent to NIR bcsel(@is_helper_invocation, source 0, source 1)
+op('helper_sel', 2, 'u1 u32')
+
 OPCODES = _opcodes
 
 ENUMS: 'Mapping[str, tuple[str, list[str]]]' = {
diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c
index 0172c15ad21..4aeaf2b6120 100644
--- a/src/intel/compiler/jay/jay_opt_propagate.c
+++ b/src/intel/compiler/jay/jay_opt_propagate.c
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "compiler/gen/gen_enums.h"
 #include "util/bitset.h"
 #include "util/lut.h"
 #include "jay_builder.h"
@@ -121,6 +122,30 @@ propagate_not(jay_inst *I, unsigned s, jay_inst *mod)
    }
 }
 
+/**
+ * Fuse demote(cmp(x, y) != 0) to demote(x CMP y): an unpredicated CMP found
+ * via defs[] donates its sources, type and cmod. The CMP is left in place. */
+static void
+fuse_demote(jay_inst *demote, jay_inst **defs)
+{
+   if (!(jay_is_ssa(demote->src[0]) &&
+         jay_is_zero(demote->src[1]) &&
+         demote->type == JAY_TYPE_U1 &&
+         demote->conditional_mod == GEN_CONDITION_NE)) {
+      return; /* not a U1 "ssa != 0" demote */
+   }
+
+   jay_inst *cmp = defs[jay_index(demote->src[0])];
+   if (cmp->op != JAY_OPCODE_CMP || cmp->predication) {
+      return; /* defs[] entry is not an unpredicated CMP */
+   }
+
+   demote->conditional_mod = cmp->conditional_mod;
+   demote->src[0] = cmp->src[0];
+   demote->src[1] = cmp->src[1];
+   demote->type = cmp->type;
+}
+
 static void
 propagate_forwards(jay_function *f)
 {
@@ -156,6 +181,11 @@ propagate_forwards(jay_function *f)
       if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND)
          continue;
 
+      /* We fuse demote forwards & upfront to avoid fighting cmod prop */
+      if (I->op == JAY_OPCODE_DEMOTE) {
+         fuse_demote(I, defs);
+      }
+
       jay_foreach_ssa_src(I, s) {
          /* Copy propagate whole vectors */
          jay_def src = I->src[s];
diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h
index c94c7dac7ab..f2288552f60 100644
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@@ -34,7 +34,8 @@ unsigned jay_process_nir(const struct intel_device_info *devinfo,
                          nir_shader *nir,
                          union brw_any_prog_data *prog_data,
                          union brw_any_prog_key *key,
-                         debug_archiver *archiver);
+                         debug_archiver *archiver,
+                         bool *track_helpers);
 
 void jay_compute_liveness(jay_function *f);
 void jay_calculate_register_demands(jay_function *f);
@@ -84,6 +85,7 @@ void jay_schedule_pressure(jay_shader *s);
 
 void jay_lower_pre_ra(jay_shader *s);
 void jay_lower_post_ra(jay_shader *s);
+void jay_lower_helpers(jay_shader *s);
 void jay_lower_spill(jay_function *func);
 void jay_lower_simd_width(jay_shader *s);
 void jay_lower_scoreboard(jay_shader *s);
diff --git a/src/intel/compiler/jay/jay_schedule.c b/src/intel/compiler/jay/jay_schedule.c
index f5a642e3550..e206ac3ae0c 100644
--- a/src/intel/compiler/jay/jay_schedule.c
+++ b/src/intel/compiler/jay/jay_schedule.c
@@ -100,9 +100,17 @@ populate_dag(struct sched_ctx *ctx,
          address = ctx->dag.node;
       }
 
-      /* Serialize side effects for now */
+      /* Serialize side effects for now, including SENDs which need to be
+       * predicated away after a demote.
+       */
       if ((I->op == JAY_OPCODE_SEND && !jay_send_pure(I)) ||
-          I->op == JAY_OPCODE_SCHEDULE_BARRIER) {
+          I->op == JAY_OPCODE_SCHEDULE_BARRIER ||
+          I->op == JAY_OPCODE_INIT_HELPERS ||
+          I->op == JAY_OPCODE_DEMOTE ||
+          I->op == JAY_OPCODE_HELPER_SEL ||
+          (I->op == JAY_OPCODE_SEND &&
+           func->shader->helpers_tracked &&
+           jay_send_skip_helpers(I))) {
 
          jay_dag_add_edge(&ctx->dag, sidefx);
          sidefx = ctx->dag.node;
diff --git a/src/intel/compiler/jay/jay_simd_width.c b/src/intel/compiler/jay/jay_simd_width.c
index 8987a50af87..f07ec23b908 100644
--- a/src/intel/compiler/jay/jay_simd_width.c
+++ b/src/intel/compiler/jay/jay_simd_width.c
@@ -22,6 +22,7 @@ max_simd_width(const jay_shader *shader, const jay_inst *I)
        I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES ||
        I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
        I->op == JAY_OPCODE_DESWIZZLE_ODD ||
+       I->op == JAY_OPCODE_INIT_HELPERS ||
        I->op == JAY_OPCODE_MUL_32 ||
        I->op == JAY_OPCODE_SHUFFLE ||
        I->op == JAY_OPCODE_ZIP_UGPR16) {
diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c
index 435c075527a..691bf52de1f 100644
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@@ -79,7 +79,7 @@ to_gen_operand(
    gen_operand R;
    unsigned reg = d.reg, count = jay_num_values(d);
    unsigned offset_B = 0, grf = 0;
-   assert(!hi || d.file == GPR);
+   assert(!hi || d.file == GPR || d.file == FLAG);
 
    if (count && (d.file == GPR || d.file == UGPR)) {
       struct jay_register_block block =
@@ -189,7 +189,8 @@ to_gen_operand(
        * SIMD1 instructions and are never SIMD split.
        */
       assert(simd_offs == 0 || idx >= 0);
-      unsigned offs_B = d.reg * (f->shader->dispatch_width / 8);
+      unsigned offs_B =
+         (d.reg * (f->shader->dispatch_width / 8)) + (hi ? 2 : 0);
       R = gen_flag(offs_B / 2);
    } else if (d.file == J_ADDRESS) {
       R = gen_address(d.reg);
@@ -580,6 +581,14 @@ emit(struct jay_codegen *jc,
       }
       break;
 
+   case JAY_OPCODE_HALT:
+      if (jay_halt_predicate_all(I)) {
+         assert(I->predication);
+         gen->pred_control =
+            jc->devinfo->ver >= 20 ? GEN_PREDICATE_XE2_ALL : GEN_PREDICATE_ALLV;
+      }
+      break;
+
    case JAY_OPCODE_HALT_TARGET:
       /* HALT temporarily disables channels, and the same instruction is used
        * to re-enable them: once all channels are disabled, then they are
diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c
index 948cbb3c564..e02620a1c8f 100644
--- a/src/intel/compiler/jay/jay_validate.c
+++ b/src/intel/compiler/jay/jay_validate.c
@@ -33,7 +33,9 @@ block_state_for_inst(jay_inst *I)
    if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) {
       return STATE_PHI_DST;
    } else if (I->op == JAY_OPCODE_PHI_SRC ||
-              (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) {
+              (jay_op_is_control_flow(I->op) &&
+               I->op != JAY_OPCODE_ELSE &&
+               I->op != JAY_OPCODE_HALT_TARGET)) {
       return STATE_LATE;
    } else {
       return STATE_NORMAL;
@@ -238,10 +240,6 @@ validate_inst(struct validate_state *validate, jay_inst *I)
    validate_flagness(validate, I->dst, I->type, "destination");
    validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag");
 
-   CHECK(!I->conditional_mod ||
-         !jay_is_null(I->cond_flag) ||
-         I->op == JAY_OPCODE_CSEL);
-
    /* These assumptions are baked into the definition of broadcast_flag and
     * required to ensure correctness with the lane masking.
     */
@@ -256,8 +254,10 @@ validate_inst(struct validate_state *validate, jay_inst *I)
    CHECK(I->cond_flag.file != FLAG || I->dst.file != UGPR);
 
    /* Standard modifiers only allowed on some instructions */
-   CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL);
    CHECK(!I->saturate || opinfo->sat);
+   CHECK(!I->conditional_mod ||
+         (I->op == JAY_OPCODE_CSEL || I->op == JAY_OPCODE_DEMOTE) ||
+         (!jay_is_null(I->cond_flag) && opinfo->cmod));
 
    unsigned num_srcs = I->num_srcs;
 
diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build
index 949127141f6..6618569b7d1 100644
--- a/src/intel/compiler/jay/meson.build
+++ b/src/intel/compiler/jay/meson.build
@@ -54,6 +54,7 @@ libintel_compiler_jay_files = files(
   'jay_ir.h',
   'jay_insert_fp_mode.c',
   'jay_liveness.c',
+  'jay_lower_helpers.c',
   'jay_lower_post_ra.c',
   'jay_lower_pre_ra.c',
   'jay_lower_scoreboard.c',