intel/compiler: Move NIR emission code to brw_fs_nir.cpp

This is a preparation to reorganize NIR emission code. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26323>
2026-05-06 02:58:05 +02:00 · 2023-11-17 17:17:25 -08:00 · 2023-11-17 17:17:25 -08:00 · c12460b01e
commit c12460b01e
parent 1ef6415d22
4 changed files with 1238 additions and 1238 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -567,29 +567,6 @@ fs_reg::component_size(unsigned width) const
   return MAX2(width * stride, 1) * type_sz(type);
 }

-/**
- * Create a MOV to read the timestamp register.
- */
-fs_reg
-fs_visitor::get_timestamp(const fs_builder &bld)
-{
-   assert(devinfo->ver >= 7);
-
-   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-                                          BRW_ARF_TIMESTAMP,
-                                          0),
-                             BRW_REGISTER_TYPE_UD));
-
-   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-
-   /* We want to read the 3 fields we care about even if it's not enabled in
-    * the dispatch.
-    */
-   bld.group(4, 0).exec_all().MOV(dst, ts);
-
-   return dst;
-}
-
 void
 fs_visitor::vfail(const char *format, va_list va)
 {
@ -1172,33 +1149,6 @@ fs_visitor::import_uniforms(fs_visitor *v)
   this->uniforms = v->uniforms;
 }

-void
-fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-
-   /* gl_FragCoord.x */
-   bld.MOV(wpos, this->pixel_x);
-   wpos = offset(wpos, bld, 1);
-
-   /* gl_FragCoord.y */
-   bld.MOV(wpos, this->pixel_y);
-   wpos = offset(wpos, bld, 1);
-
-   /* gl_FragCoord.z */
-   if (devinfo->ver >= 6) {
-      bld.MOV(wpos, this->pixel_z);
-   } else {
-      bld.emit(FS_OPCODE_LINTERP, wpos,
-               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
-               component(interp_reg(VARYING_SLOT_POS, 2), 0));
-   }
-   wpos = offset(wpos, bld, 1);
-
-   /* gl_FragCoord.w: Already set up in emit_interpolation */
-   bld.MOV(wpos, this->wpos_w);
-}
-
 enum brw_barycentric_mode
 brw_barycentric_mode(nir_intrinsic_instr *intr)
 {
@ -1242,329 +1192,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
   return (enum brw_barycentric_mode) ((unsigned) bary - 1);
 }

-fs_reg
-fs_visitor::emit_frontfacing_interpolation()
-{
-   fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D);
-
-   if (devinfo->ver >= 12) {
-      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
-
-      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
-      bld.ASR(tmp, g1, brw_imm_d(15));
-      bld.NOT(ff, tmp);
-   } else if (devinfo->ver >= 6) {
-      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
-       * a boolean result from this (~0/true or 0/false).
-       *
-       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
-       * this task in only one instruction:
-       *    - a negation source modifier will flip the bit; and
-       *    - a W -> D type conversion will sign extend the bit into the high
-       *      word of the destination.
-       *
-       * An ASR 15 fills the low word of the destination.
-       */
-      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
-      g0.negate = true;
-
-      bld.ASR(ff, g0, brw_imm_d(15));
-   } else {
-      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
-       * a boolean result from this (1/true or 0/false).
-       *
-       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
-       * the negation source modifier to flip it. Unfortunately the SHR
-       * instruction only operates on UD (or D with an abs source modifier)
-       * sources without negation.
-       *
-       * Instead, use ASR (which will give ~0/true or 0/false).
-       */
-      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
-      g1_6.negate = true;
-
-      bld.ASR(ff, g1_6, brw_imm_d(31));
-   }
-
-   return ff;
-}
-
-fs_reg
-fs_visitor::emit_samplepos_setup()
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
-   assert(devinfo->ver >= 6);
-
-   const fs_builder abld = bld.annotate("compute sample position");
-   fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);
-
-   if (wm_prog_data->persample_dispatch == BRW_NEVER) {
-      /* From ARB_sample_shading specification:
-       * "When rendering to a non-multisample buffer, or if multisample
-       *  rasterization is disabled, gl_SamplePosition will always be
-       *  (0.5, 0.5).
-       */
-      bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
-      bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
-      return pos;
-   }
-
-   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
-    * mode will be enabled.
-    *
-    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
-    * R31.1:0         Position Offset X/Y for Slot[3:0]
-    * R31.3:2         Position Offset X/Y for Slot[7:4]
-    * .....
-    *
-    * The X, Y sample positions come in as bytes in  thread payload. So, read
-    * the positions using vstride=16, width=8, hstride=2.
-    */
-   const fs_reg sample_pos_reg =
-      fetch_payload_reg(abld, fs_payload().sample_pos_reg, BRW_REGISTER_TYPE_W);
-
-   for (unsigned i = 0; i < 2; i++) {
-      fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D);
-      abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i));
-      /* Convert int_sample_pos to floating point */
-      fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F);
-      abld.MOV(tmp_f, tmp_d);
-      /* Scale to the range [0, 1] */
-      abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
-   }
-
-   if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
-      check_dynamic_msaa_flag(abld, wm_prog_data,
-                              BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
-      for (unsigned i = 0; i < 2; i++) {
-         set_predicate(BRW_PREDICATE_NORMAL,
-                       bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
-                               brw_imm_f(0.5f)));
-      }
-   }
-
-   return pos;
-}
-
-fs_reg
-fs_visitor::emit_sampleid_setup()
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
-   assert(devinfo->ver >= 6);
-
-   const fs_builder abld = bld.annotate("compute sample id");
-   fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);
-
-   assert(key->multisample_fbo != BRW_NEVER);
-
-   if (devinfo->ver >= 8) {
-      /* Sample ID comes in as 4-bit numbers in g1.0:
-       *
-       *    15:12 Slot 3 SampleID (only used in SIMD16)
-       *     11:8 Slot 2 SampleID (only used in SIMD16)
-       *      7:4 Slot 1 SampleID
-       *      3:0 Slot 0 SampleID
-       *
-       * Each slot corresponds to four channels, so we want to replicate each
-       * half-byte value to 4 channels in a row:
-       *
-       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
-       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
-       *
-       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
-       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
-       *
-       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
-       * channels to read the first byte (7:0), and the second group of 8
-       * channels to read the second byte (15:8).  Then, we shift right by
-       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
-       * values into place.  Finally, we AND with 0xf to keep the low nibble.
-       *
-       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
-       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
-       *
-       * TODO: These payload bits exist on Gfx7 too, but they appear to always
-       *       be zero, so this code fails to work.  We should find out why.
-       */
-      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
-
-      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
-         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
-         hbld.SHR(offset(tmp, hbld, i),
-                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
-                         1, 8, 0),
-                  brw_imm_v(0x44440000));
-      }
-
-      abld.AND(sample_id, tmp, brw_imm_w(0xf));
-   } else {
-      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
-      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
-
-      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
-       * 8x multisampling, subspan 0 will represent sample N (where N
-       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
-       * 7. We can find the value of N by looking at R0.0 bits 7:6
-       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
-       * (since samples are always delivered in pairs). That is, we
-       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
-       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
-       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
-       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
-       * populating a temporary variable with the sequence (0, 1, 2, 3),
-       * and then reading from it using vstride=1, width=4, hstride=0.
-       * These computations hold good for 4x multisampling as well.
-       *
-       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
-       * the first four slots are sample 0 of subspan 0; the next four
-       * are sample 1 of subspan 0; the third group is sample 0 of
-       * subspan 1, and finally sample 1 of subspan 1.
-       */
-
-      /* SKL+ has an extra bit for the Starting Sample Pair Index to
-       * accommodate 16x MSAA.
-       */
-      abld.exec_all().group(1, 0)
-          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
-               brw_imm_ud(0xc0));
-      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
-
-      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
-       * can assume 4x MSAA.  Disallow it on IVB+
-       *
-       * FINISHME: One day, we could come up with a way to do this that
-       * actually works on gfx7.
-       */
-      if (devinfo->ver >= 7)
-         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
-      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));
-
-      /* This special instruction takes care of setting vstride=1,
-       * width=4, hstride=0 of t2 during an ADD instruction.
-       */
-      abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
-   }
-
-   if (key->multisample_fbo == BRW_SOMETIMES) {
-      check_dynamic_msaa_flag(abld, wm_prog_data,
-                              BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO);
-      set_predicate(BRW_PREDICATE_NORMAL,
-                    abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
-   }
-
-   return sample_id;
-}
-
-fs_reg
-fs_visitor::emit_samplemaskin_setup()
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
-   assert(devinfo->ver >= 6);
-
-   /* The HW doesn't provide us with expected values. */
-   assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);
-
-   fs_reg coverage_mask =
-      fetch_payload_reg(bld, fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);
-
-   if (wm_prog_data->persample_dispatch == BRW_NEVER)
-      return coverage_mask;
-
-   /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
-    * and a mask representing which sample is being processed by the
-    * current shader invocation.
-    *
-    * From the OES_sample_variables specification:
-    * "When per-sample shading is active due to the use of a fragment input
-    *  qualified by "sample" or due to the use of the gl_SampleID or
-    *  gl_SamplePosition variables, only the bit for the current sample is
-    *  set in gl_SampleMaskIn."
-    */
-   const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
-
-   if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
-      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
-
-   fs_reg one = vgrf(glsl_type::int_type);
-   fs_reg enabled_mask = vgrf(glsl_type::int_type);
-   abld.MOV(one, brw_imm_d(1));
-   abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
-   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
-   abld.AND(mask, enabled_mask, coverage_mask);
-
-   if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
-      return mask;
-
-   check_dynamic_msaa_flag(abld, wm_prog_data,
-                           BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
-   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
-
-   return mask;
-}
-
-fs_reg
-fs_visitor::emit_shading_rate_setup()
-{
-   assert(devinfo->ver >= 11);
-
-   struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(bld.shader->stage_prog_data);
-
-   /* Coarse pixel shading size fields overlap with other fields of not in
-    * coarse pixel dispatch mode, so report 0 when that's not the case.
-    */
-   if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
-      return brw_imm_ud(0);
-
-   const fs_builder abld = bld.annotate("compute fragment shading rate");
-
-   /* The shading rates provided in the shader are the actual 2D shading
-    * rate while the SPIR-V built-in is the enum value that has the shading
-    * rate encoded as a bitfield.  Fortunately, the bitfield value is just
-    * the shading rate divided by two and shifted.
-    */
-
-   /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
-   fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
-   /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
-   fs_reg actual_y = byte_offset(actual_x, 1);
-
-   fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
-   fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-   abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
-   abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
-   abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
-
-   fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
-   abld.OR(rate, int_rate_x, int_rate_y);
-
-   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
-      return rate;
-
-   check_dynamic_msaa_flag(abld, wm_prog_data,
-                           BRW_WM_MSAA_FLAG_COARSE_RT_WRITES);
-   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
-
-   return rate;
-}
-
-fs_reg
-fs_visitor::resolve_source_modifiers(const fs_builder &bld, const fs_reg &src)
-{
-   if (!src.abs && !src.negate)
-      return src;
-
-   fs_reg temp = bld.vgrf(src.type);
-   bld.MOV(temp, src);
-
-   return temp;
-}
-
 /**
 * Walk backwards from the end of the program looking for a URB write that
 * isn't in control flow, and mark it with EOT.
@ -4847,33 +4474,6 @@ brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
   }
 }

-void
-fs_visitor::emit_is_helper_invocation(fs_reg result)
-{
-   /* Unlike the regular gl_HelperInvocation, that is defined at dispatch,
-    * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into
-    * consideration demoted invocations.
-    */
-   result.type = BRW_REGISTER_TYPE_UD;
-
-   bld.MOV(result, brw_imm_ud(0));
-
-   /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
-   unsigned width = bld.dispatch_width();
-   for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
-      const fs_builder b = bld.group(MIN2(width, 16), i);
-
-      fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
-
-      /* The at() ensures that any code emitted to get the predicate happens
-       * before the mov right above.  This is not an issue elsewhere because
-       * lowering code already set up the builder this way.
-       */
-      brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
-      mov->predicate_inverse = true;
-   }
-}
-
 static bool
 is_mixed_float_with_fp32_dst(const fs_inst *inst)
 {
@ -8130,24 +7730,6 @@ brw_compile_fs(const struct brw_compiler *compiler,
   return g.get_assembly();
 }

-fs_reg
-fs_visitor::emit_work_group_id_setup()
-{
-   assert(gl_shader_stage_is_compute(stage));
-
-   fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
-
-   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
-   bld.MOV(id, r0_1);
-
-   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
-   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
-   bld.MOV(offset(id, bld, 1), r0_6);
-   bld.MOV(offset(id, bld, 2), r0_7);
-
-   return id;
-}
-
 unsigned
 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@ -34,34 +34,6 @@

 using namespace brw;

-/* Sample from the MCS surface attached to this multisample texture. */
-fs_reg
-fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
-                           const fs_reg &texture,
-                           const fs_reg &texture_handle)
-{
-   const fs_reg dest = vgrf(glsl_type::uvec4_type);
-
-   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
-   srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
-   srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
-   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
-   srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
-   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
-   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
-   srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
-
-   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
-                            ARRAY_SIZE(srcs));
-
-   /* We only care about one or two regs of response, but the sampler always
-    * writes 4/8.
-    */
-   inst->size_written = 4 * dest.component_size(inst->exec_size);
-
-   return dest;
-}
-
 /* Input data is organized with first the per-primitive values, followed
 * by per-vertex values.  The per-vertex will have interpolation information
 * associated, so use 4 components for each value.
@ -1211,101 +1183,6 @@ fs_visitor::emit_cs_terminate()
   inst->eot = true;
 }

-static void
-setup_barrier_message_payload_gfx125(const fs_builder &bld,
-                                     const fs_reg &msg_payload)
-{
-   assert(bld.shader->devinfo->verx10 >= 125);
-
-   /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
-   fs_reg m0_10ub = component(retype(msg_payload, BRW_REGISTER_TYPE_UB), 10);
-   fs_reg r0_11ub =
-      stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11),
-             0, 1, 0);
-   bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
-}
-
-void
-fs_visitor::emit_barrier()
-{
-   /* We are getting the barrier ID from the compute shader header */
-   assert(gl_shader_stage_uses_workgroup(stage));
-
-   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-
-   /* Clear the message payload */
-   bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u));
-
-   if (devinfo->verx10 >= 125) {
-      setup_barrier_message_payload_gfx125(bld, payload);
-   } else {
-      assert(gl_shader_stage_is_compute(stage));
-
-      uint32_t barrier_id_mask;
-      switch (devinfo->ver) {
-      case 7:
-      case 8:
-         barrier_id_mask = 0x0f000000u; break;
-      case 9:
-         barrier_id_mask = 0x8f000000u; break;
-      case 11:
-      case 12:
-         barrier_id_mask = 0x7f000000u; break;
-      default:
-         unreachable("barrier is only available on gen >= 7");
-      }
-
-      /* Copy the barrier id from r0.2 to the message payload reg.2 */
-      fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
-      bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
-                                     brw_imm_ud(barrier_id_mask));
-   }
-
-   /* Emit a gateway "barrier" message using the payload we set up, followed
-    * by a wait instruction.
-    */
-   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
-}
-
-void
-fs_visitor::emit_tcs_barrier()
-{
-   assert(stage == MESA_SHADER_TESS_CTRL);
-   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
-
-   fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-   fs_reg m0_2 = component(m0, 2);
-
-   const fs_builder chanbld = bld.exec_all().group(1, 0);
-
-   /* Zero the message header */
-   bld.exec_all().MOV(m0, brw_imm_ud(0u));
-
-   if (devinfo->verx10 >= 125) {
-      setup_barrier_message_payload_gfx125(bld, m0);
-   } else if (devinfo->ver >= 11) {
-      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
-                  brw_imm_ud(INTEL_MASK(30, 24)));
-
-      /* Set the Barrier Count and the enable bit */
-      chanbld.OR(m0_2, m0_2,
-                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
-   } else {
-      /* Copy "Barrier ID" from r0.2, bits 16:13 */
-      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
-                  brw_imm_ud(INTEL_MASK(16, 13)));
-
-      /* Shift it up to bits 27:24. */
-      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
-
-      /* Set the Barrier Count and the enable bit */
-      chanbld.OR(m0_2, m0_2,
-                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
-   }
-
-   bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
-}
-
 fs_visitor::fs_visitor(const struct brw_compiler *compiler,
                       const struct brw_compile_params *params,
                       const brw_base_prog_key *key,
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@ -1604,700 +1604,3 @@ brw_compile_mesh(const struct brw_compiler *compiler,
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
 }
-
-static unsigned
-component_from_intrinsic(nir_intrinsic_instr *instr)
-{
-   if (nir_intrinsic_has_component(instr))
-      return nir_intrinsic_component(instr);
-   else
-      return 0;
-}
-
-static void
-adjust_handle_and_offset(const fs_builder &bld,
-                         fs_reg &urb_handle,
-                         unsigned &urb_global_offset)
-{
-   /* Make sure that URB global offset is below 2048 (2^11), because
-    * that's the maximum possible value encoded in Message Descriptor.
-    */
-   unsigned adjustment = (urb_global_offset >> 11) << 11;
-
-   if (adjustment) {
-      fs_builder ubld8 = bld.group(8, 0).exec_all();
-      /* Allocate new register to not overwrite the shared URB handle. */
-      fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment));
-      urb_handle = new_handle;
-      urb_global_offset -= adjustment;
-   }
-}
-
-static void
-emit_urb_direct_vec4_write(const fs_builder &bld,
-                           unsigned urb_global_offset,
-                           const fs_reg &src,
-                           fs_reg urb_handle,
-                           unsigned dst_comp_offset,
-                           unsigned comps,
-                           unsigned mask)
-{
-   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
-      fs_builder bld8 = bld.group(8, q);
-
-      fs_reg payload_srcs[8];
-      unsigned length = 0;
-
-      for (unsigned i = 0; i < dst_comp_offset; i++)
-         payload_srcs[length++] = reg_undef;
-
-      for (unsigned c = 0; c < comps; c++)
-         payload_srcs[length++] = quarter(offset(src, bld, c), q);
-
-      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
-      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
-                                          BRW_REGISTER_TYPE_F);
-      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
-      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
-
-      fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
-                                reg_undef, srcs, ARRAY_SIZE(srcs));
-      inst->offset = urb_global_offset;
-      assert(inst->offset < 2048);
-   }
-}
-
-static void
-emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
-                       const fs_reg &src, fs_reg urb_handle)
-{
-   assert(nir_src_bit_size(instr->src[0]) == 32);
-
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset_nir_src));
-
-   const unsigned comps = nir_src_num_components(instr->src[0]);
-   assert(comps <= 4);
-
-   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
-                                     nir_src_as_uint(*offset_nir_src) +
-                                     component_from_intrinsic(instr);
-
-   /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
-    * We can write up to 8 dwords, so single vec4 write is enough.
-    */
-   const unsigned comp_shift = offset_in_dwords % 4;
-   const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
-
-   unsigned urb_global_offset = offset_in_dwords / 4;
-   adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
-
-   emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
-                              comp_shift, comps, mask);
-}
-
-static void
-emit_urb_direct_vec4_write_xe2(const fs_builder &bld,
-                               unsigned offset_in_bytes,
-                               const fs_reg &src,
-                               fs_reg urb_handle,
-                               unsigned comps,
-                               unsigned mask)
-{
-   const struct intel_device_info *devinfo = bld.shader->devinfo;
-   const unsigned runit = reg_unit(devinfo);
-   const unsigned write_size = 8 * runit;
-
-   if (offset_in_bytes > 0) {
-      fs_builder bldall = bld.group(write_size, 0).exec_all();
-      fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD);
-      bldall.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_bytes));
-      urb_handle = new_handle;
-   }
-
-   for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
-      fs_builder hbld = bld.group(write_size, q);
-
-      fs_reg payload_srcs[comps];
-
-      for (unsigned c = 0; c < comps; c++)
-         payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
-
-      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
-      int nr = bld.shader->alloc.allocate(comps * runit);
-      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F);
-      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
-      hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
-
-      hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
-                reg_undef, srcs, ARRAY_SIZE(srcs));
-   }
-}
-
-static void
-emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
-                           const fs_reg &src, fs_reg urb_handle)
-{
-   assert(nir_src_bit_size(instr->src[0]) == 32);
-
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset_nir_src));
-
-   const unsigned comps = nir_src_num_components(instr->src[0]);
-   assert(comps <= 4);
-
-   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
-                                     nir_src_as_uint(*offset_nir_src) +
-                                     component_from_intrinsic(instr);
-
-   const unsigned mask = nir_intrinsic_write_mask(instr);
-
-   emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src,
-                                    urb_handle, comps, mask);
-}
-
-static void
-emit_urb_indirect_vec4_write(const fs_builder &bld,
-                             const fs_reg &offset_src,
-                             unsigned base,
-                             const fs_reg &src,
-                             fs_reg urb_handle,
-                             unsigned dst_comp_offset,
-                             unsigned comps,
-                             unsigned mask)
-{
-   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
-      fs_builder bld8 = bld.group(8, q);
-
-      /* offset is always positive, so signedness doesn't matter */
-      assert(offset_src.type == BRW_REGISTER_TYPE_D ||
-             offset_src.type == BRW_REGISTER_TYPE_UD);
-      fs_reg off = bld8.vgrf(offset_src.type, 1);
-      bld8.MOV(off, quarter(offset_src, q));
-      bld8.ADD(off, off, brw_imm_ud(base));
-      bld8.SHR(off, off, brw_imm_ud(2));
-
-      fs_reg payload_srcs[8];
-      unsigned length = 0;
-
-      for (unsigned i = 0; i < dst_comp_offset; i++)
-         payload_srcs[length++] = reg_undef;
-
-      for (unsigned c = 0; c < comps; c++)
-         payload_srcs[length++] = quarter(offset(src, bld, c), q);
-
-      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
-      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
-      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
-                                          BRW_REGISTER_TYPE_F);
-      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
-      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
-
-      fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
-                                reg_undef, srcs, ARRAY_SIZE(srcs));
-      inst->offset = 0;
-   }
-}
-
-static void
-emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
-                             const fs_reg &src, const fs_reg &offset_src,
-                             fs_reg urb_handle, unsigned mod)
-{
-   assert(nir_src_bit_size(instr->src[0]) == 32);
-
-   const unsigned comps = nir_src_num_components(instr->src[0]);
-   assert(comps <= 4);
-
-   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   component_from_intrinsic(instr);
-
-   const unsigned comp_shift = mod;
-   const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
-
-   emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
-                                urb_handle, comp_shift, comps, mask);
-}
-
-static void
-emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
-                             const fs_reg &src, const fs_reg &offset_src,
-                             fs_reg urb_handle)
-{
-   assert(nir_src_bit_size(instr->src[0]) == 32);
-
-   const struct intel_device_info *devinfo = bld.shader->devinfo;
-   const unsigned runit = reg_unit(devinfo);
-   const unsigned write_size = 8 * runit;
-
-   const unsigned comps = nir_src_num_components(instr->src[0]);
-   assert(comps <= 4);
-
-   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   component_from_intrinsic(instr);
-
-   if (base_in_dwords > 0) {
-      fs_builder bldall = bld.group(write_size, 0).exec_all();
-      fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD);
-      bldall.ADD(new_handle, urb_handle, brw_imm_ud(base_in_dwords * 4));
-      urb_handle = new_handle;
-   }
-
-   const unsigned mask = nir_intrinsic_write_mask(instr);
-
-   for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
-      fs_builder wbld = bld.group(write_size, q);
-
-      fs_reg payload_srcs[comps];
-
-      for (unsigned c = 0; c < comps; c++)
-         payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
-
-      fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD);
-      wbld.SHL(addr, horiz_offset(offset_src, write_size * q), brw_imm_ud(2));
-      wbld.ADD(addr, addr, urb_handle);
-
-      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = addr;
-      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
-      int nr = bld.shader->alloc.allocate(comps * runit);
-      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F);
-      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
-      wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
-
-      wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
-                reg_undef, srcs, ARRAY_SIZE(srcs));
-   }
-}
-
-static void
-emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
-                         const fs_reg &src, const fs_reg &offset_src,
-                         fs_reg urb_handle)
-{
-   assert(nir_src_bit_size(instr->src[0]) == 32);
-
-   const unsigned comps = nir_src_num_components(instr->src[0]);
-   assert(comps <= 4);
-
-   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   component_from_intrinsic(instr);
-
-   /* Use URB write message that allow different offsets per-slot.  The offset
-    * is in units of vec4s (128 bits), so we use a write for each component,
-    * replicating it in the sources and applying the appropriate mask based on
-    * the dword offset.
-    */
-
-   for (unsigned c = 0; c < comps; c++) {
-      if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
-         continue;
-
-      fs_reg src_comp = offset(src, bld, c);
-
-      for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
-         fs_builder bld8 = bld.group(8, q);
-
-         /* offset is always positive, so signedness doesn't matter */
-         assert(offset_src.type == BRW_REGISTER_TYPE_D ||
-                offset_src.type == BRW_REGISTER_TYPE_UD);
-         fs_reg off = bld8.vgrf(offset_src.type, 1);
-         bld8.MOV(off, quarter(offset_src, q));
-         bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords));
-
-         fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld8.AND(mask, off, brw_imm_ud(0x3));
-
-         fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld8.MOV(one, brw_imm_ud(1));
-         bld8.SHL(mask, one, mask);
-         bld8.SHL(mask, mask, brw_imm_ud(16));
-
-         bld8.SHR(off, off, brw_imm_ud(2));
-
-         fs_reg payload_srcs[4];
-         unsigned length = 0;
-
-         for (unsigned j = 0; j < 4; j++)
-            payload_srcs[length++] = quarter(src_comp, q);
-
-         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
-         srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
-         srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
-                                             BRW_REGISTER_TYPE_F);
-         srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
-         bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
-
-         fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
-                                   reg_undef, srcs, ARRAY_SIZE(srcs));
-         inst->offset = 0;
-      }
-   }
-}
-
-static void
-emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
-                      const fs_reg &dest, fs_reg urb_handle)
-{
-   assert(instr->def.bit_size == 32);
-
-   unsigned comps = instr->def.num_components;
-   if (comps == 0)
-      return;
-
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset_nir_src));
-
-   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
-                                     nir_src_as_uint(*offset_nir_src) +
-                                     component_from_intrinsic(instr);
-
-   unsigned urb_global_offset = offset_in_dwords / 4;
-   adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
-
-   const unsigned comp_offset = offset_in_dwords % 4;
-   const unsigned num_regs = comp_offset + comps;
-
-   fs_builder ubld8 = bld.group(8, 0).exec_all();
-   fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
-   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-   srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-
-   fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
-                              srcs, ARRAY_SIZE(srcs));
-   inst->offset = urb_global_offset;
-   assert(inst->offset < 2048);
-   inst->size_written = num_regs * REG_SIZE;
-
-   for (unsigned c = 0; c < comps; c++) {
-      fs_reg dest_comp = offset(dest, bld, c);
-      fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
-      bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
-   }
-}
-
-static void
-emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
-                          const fs_reg &dest, fs_reg urb_handle)
-{
-   assert(instr->def.bit_size == 32);
-
-   unsigned comps = instr->def.num_components;
-   if (comps == 0)
-      return;
-
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset_nir_src));
-
-   fs_builder ubld16 = bld.group(16, 0).exec_all();
-
-   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
-                                     nir_src_as_uint(*offset_nir_src) +
-                                     component_from_intrinsic(instr);
-
-   if (offset_in_dwords > 0) {
-      fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4));
-      urb_handle = new_handle;
-   }
-
-   fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps);
-   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-   srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-
-   fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL,
-                               data, srcs, ARRAY_SIZE(srcs));
-   inst->size_written = 2 * comps * REG_SIZE;
-
-   for (unsigned c = 0; c < comps; c++) {
-      fs_reg dest_comp = offset(dest, bld, c);
-      fs_reg data_comp = horiz_stride(offset(data, ubld16, c), 0);
-      bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
-   }
-}
-
-static void
-emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
-                        const fs_reg &dest, const fs_reg &offset_src, fs_reg urb_handle)
-{
-   assert(instr->def.bit_size == 32);
-
-   unsigned comps = instr->def.num_components;
-   if (comps == 0)
-      return;
-
-   fs_reg seq_ud;
-   {
-      fs_builder ubld8 = bld.group(8, 0).exec_all();
-      seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
-      fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1);
-      ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210)));
-      ubld8.MOV(seq_ud, seq_uw);
-      ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2));
-   }
-
-   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
-                                   component_from_intrinsic(instr);
-
-   for (unsigned c = 0; c < comps; c++) {
-      for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
-         fs_builder bld8 = bld.group(8, q);
-
-         /* offset is always positive, so signedness doesn't matter */
-         assert(offset_src.type == BRW_REGISTER_TYPE_D ||
-                offset_src.type == BRW_REGISTER_TYPE_UD);
-         fs_reg off = bld8.vgrf(offset_src.type, 1);
-         bld8.MOV(off, quarter(offset_src, q));
-         bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c));
-
-         STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
-
-         fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld8.AND(comp, off, brw_imm_ud(0x3));
-         bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1));
-         bld8.ADD(comp, comp, seq_ud);
-
-         bld8.SHR(off, off, brw_imm_ud(2));
-
-         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
-
-         fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
-
-         fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
-                                   data, srcs, ARRAY_SIZE(srcs));
-         inst->offset = 0;
-         inst->size_written = 4 * REG_SIZE;
-
-         fs_reg dest_comp = offset(dest, bld, c);
-         bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
-                   retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD),
-                   data,
-                   comp,
-                   brw_imm_ud(4 * REG_SIZE));
-      }
-   }
-}
-
-static void
-emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
-                            const fs_reg &dest, const fs_reg &offset_src,
-                            fs_reg urb_handle)
-{
-   assert(instr->def.bit_size == 32);
-
-   unsigned comps = instr->def.num_components;
-   if (comps == 0)
-      return;
-
-   fs_builder ubld16 = bld.group(16, 0).exec_all();
-
-   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
-                                     component_from_intrinsic(instr);
-
-   if (offset_in_dwords > 0) {
-      fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4));
-      urb_handle = new_handle;
-   }
-
-   fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps);
-
-
-   for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) {
-      fs_builder wbld = bld.group(16, q);
-
-      fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD);
-      wbld.SHL(addr, horiz_offset(offset_src, 16 * q), brw_imm_ud(2));
-      wbld.ADD(addr, addr, urb_handle);
-
-      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = addr;
-
-      fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL,
-                                 data, srcs, ARRAY_SIZE(srcs));
-      inst->size_written = 2 * comps * REG_SIZE;
-
-      for (unsigned c = 0; c < comps; c++) {
-         fs_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q);
-         fs_reg data_comp = offset(data, wbld, c);
-         wbld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
-      }
-   }
-}
-
-void
-fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *instr,
-                                 const fs_reg &urb_handle)
-{
-   fs_reg src = get_nir_src(instr->src[0]);
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-
-   if (nir_src_is_const(*offset_nir_src)) {
-      if (bld.shader->devinfo->ver >= 20)
-         emit_urb_direct_writes_xe2(bld, instr, src, urb_handle);
-      else
-         emit_urb_direct_writes(bld, instr, src, urb_handle);
-   } else {
-      if (bld.shader->devinfo->ver >= 20) {
-         emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle);
-         return;
-      }
-      bool use_mod = false;
-      unsigned mod;
-
-      /* Try to calculate the value of (offset + base) % 4. If we can do
-       * this, then we can do indirect writes using only 1 URB write.
-       */
-      use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
-      if (use_mod) {
-         mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
-         mod %= 4;
-      }
-
-      if (use_mod) {
-         emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle, mod);
-      } else {
-         emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle);
-      }
-   }
-}
-
-void
-fs_visitor::emit_task_mesh_load(const fs_builder &bld, nir_intrinsic_instr *instr,
-                                const fs_reg &urb_handle)
-{
-   fs_reg dest = get_nir_def(instr->def);
-   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
-
-   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
-    * the non-array-index offset, we could use to decide if we can perform
-    * a single large aligned read instead one per component.
-    */
-
-   if (nir_src_is_const(*offset_nir_src)) {
-      if (bld.shader->devinfo->ver >= 20)
-         emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle);
-      else
-         emit_urb_direct_reads(bld, instr, dest, urb_handle);
-   } else {
-      if (bld.shader->devinfo->ver >= 20)
-         emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(*offset_nir_src), urb_handle);
-      else
-         emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src), urb_handle);
-   }
-}
-
-void
-fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,
-                                    nir_intrinsic_instr *instr)
-{
-   assert(stage == MESA_SHADER_TASK);
-   const task_mesh_thread_payload &payload = task_mesh_payload();
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_store_output:
-   case nir_intrinsic_store_task_payload:
-      emit_task_mesh_store(bld, instr, payload.urb_output);
-      break;
-
-   case nir_intrinsic_load_output:
-   case nir_intrinsic_load_task_payload:
-      emit_task_mesh_load(bld, instr, payload.urb_output);
-      break;
-
-   default:
-      nir_emit_task_mesh_intrinsic(bld, instr);
-      break;
-   }
-}
-
-void
-fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
-                                    nir_intrinsic_instr *instr)
-{
-   assert(stage == MESA_SHADER_MESH);
-   const task_mesh_thread_payload &payload = task_mesh_payload();
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_store_per_primitive_output:
-   case nir_intrinsic_store_per_vertex_output:
-   case nir_intrinsic_store_output:
-      emit_task_mesh_store(bld, instr, payload.urb_output);
-      break;
-
-   case nir_intrinsic_load_per_vertex_output:
-   case nir_intrinsic_load_per_primitive_output:
-   case nir_intrinsic_load_output:
-      emit_task_mesh_load(bld, instr, payload.urb_output);
-      break;
-
-   case nir_intrinsic_load_task_payload:
-      emit_task_mesh_load(bld, instr, payload.task_urb_input);
-      break;
-
-   default:
-      nir_emit_task_mesh_intrinsic(bld, instr);
-      break;
-   }
-}
-
-void
-fs_visitor::nir_emit_task_mesh_intrinsic(const fs_builder &bld,
-                                         nir_intrinsic_instr *instr)
-{
-   assert(stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK);
-   const task_mesh_thread_payload &payload = task_mesh_payload();
-
-   fs_reg dest;
-   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
-      dest = get_nir_def(instr->def);
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_mesh_inline_data_intel: {
-      fs_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
-      bld.MOV(dest, retype(data, dest.type));
-      break;
-   }
-
-   case nir_intrinsic_load_draw_id:
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      bld.MOV(dest, payload.extended_parameter_0);
-      break;
-
-   case nir_intrinsic_load_local_invocation_id:
-      unreachable("local invocation id should have been lowered earlier");
-      break;
-
-   case nir_intrinsic_load_local_invocation_index:
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      bld.MOV(dest, payload.local_index);
-      break;
-
-   case nir_intrinsic_load_num_workgroups:
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
-      bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8));  /* g0.4 & 0xffff */
-      bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9));  /* g0.4 >> 16 */
-      break;
-
-   case nir_intrinsic_load_workgroup_index:
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
-      break;
-
-   default:
-      nir_emit_cs_intrinsic(bld, instr);
-      break;
-   }
-}