brw: lower non coherent FS load_output in NIR

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37527>
2025-12-22 07:00:12 +01:00 · 2025-09-02 13:39:29 +03:00 · 2025-09-02 13:39:29 +03:00 · d4ab2087cf
commit d4ab2087cf
parent 3b6b03bd3b
5 changed files with 69 additions and 78 deletions
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@ -1518,6 +1518,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
   brw_nir_lower_fs_inputs(nir, devinfo, key);
   brw_nir_lower_fs_outputs(nir);
   if (!key->coherent_fb_fetch)
      NIR_PASS(_, nir, brw_nir_lower_fs_load_output, key);
   /* From the SKL PRM, Volume 7, "Alpha Coverage":
    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
    *   hardware, regardless of the state setting for this feature."
--- a/src/intel/compiler/brw_from_nir.cpp
+++ b/src/intel/compiler/brw_from_nir.cpp
@ -3647,80 +3647,6 @@ emit_mcs_fetch(nir_to_brw_state &ntb, const brw_reg &coordinate, unsigned compon
   return dest;
 }
 /**
 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
 * framebuffer at the current fragment coordinates and sample index.
 *
 * \param ntb     Translation state.  Its shader (ntb.s) supplies the device
 *                info, program key and pixel X/Y registers.  As a side effect
 *                ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] is filled in when
 *                the key is multisampled and the entry is still BAD_FILE.
 * \param bld     Builder used to emit every instruction; its shader must be
 *                a fragment shader (asserted).
 * \param dst     Destination register of the texel fetch.  The instruction is
 *                marked as writing four components of it.
 * \param target  Value used as the immediate surface index of the fetch (and
 *                of the MCS fetch when multisampled).
 *
 * \return The emitted texel fetch instruction.
 *
 * The program key must have coherent_fb_fetch unset and multisample_fbo set
 * to either INTEL_ALWAYS or INTEL_NEVER; both are asserted.
  */
 static brw_inst *
 emit_non_coherent_fb_read(nir_to_brw_state &ntb, const brw_builder &bld, const brw_reg &dst,
                           unsigned target)
 {
   brw_shader &s = ntb.s;
   const struct intel_device_info *devinfo = s.devinfo;
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   const brw_wm_prog_key *wm_key =
      reinterpret_cast<const brw_wm_prog_key *>(s.key);
   assert(!wm_key->coherent_fb_fetch);
   /* Calculate the fragment coordinates: pixel X, pixel Y and the value
    * returned by fetch_render_target_array_index() as third component.
    */
   const brw_reg coords = bld.vgrf(BRW_TYPE_UD, 3);
   bld.MOV(offset(coords, bld, 0), s.pixel_x);
   bld.MOV(offset(coords, bld, 1), s.pixel_y);
   bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
   /* Calculate the sample index and MCS payload when multisampling.  Luckily
    * the MCS fetch message behaves deterministically for UMS surfaces, so it
    * shouldn't be necessary to recompile based on whether the framebuffer is
    * CMS or UMS.
    */
   assert(wm_key->multisample_fbo == INTEL_ALWAYS ||
          wm_key->multisample_fbo == INTEL_NEVER);
   /* Set up the sample ID lazily: only when multisampling and only if no
    * earlier code already populated the cached system value.
    */
   if (wm_key->multisample_fbo &&
       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
      ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
   /* In the single-sampled case this is whatever the cache currently holds
    * (NOTE(review): possibly still BAD_FILE -- confirm the TXF path ignores
    * it), and the MCS source is a default-constructed register.
    */
   const brw_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
   const brw_reg mcs = wm_key->multisample_fbo ?
      emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), brw_reg()) : brw_reg();
   /* Use either a normal or a CMS texel fetch message depending on whether
    * the framebuffer is single or multisample.
    */
   opcode op;
   if (wm_key->multisample_fbo) {
      /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
       * multisampling, it should be equivalent to the normal CMS fetch for
       * lower multisampling modes.
       *
       * On Gfx12HP, there is only CMS_W variant available.
       */
      if (devinfo->verx10 >= 125)
         op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
      else
         op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
   } else {
      op = SHADER_OPCODE_TXF_LOGICAL;
   }
   /* Emit the instruction.  Sources not assigned below keep their
    * default-constructed value; LOD and sampler are the immediate 0.
    */
   brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
   srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
   srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
   srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
   srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
   srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(target);
   srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
   brw_tex_inst *tex = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs))->as_tex();
   /* The fetch always writes four components, matching the three coordinate
    * components set up above.
    */
   tex->size_written = 4 * tex->dst.component_size(tex->exec_size);
   tex->coord_components = 3;
   return tex;
 }
 /**
 * Actual coherent framebuffer read implemented using the native render target
 * read message.  Requires SKL+.
@ -4260,10 +4186,8 @@ brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
      const brw_reg tmp = bld.vgrf(dest.type, 4);
-      if (reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch)
+      assert(reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch);
-         emit_coherent_fb_read(bld, tmp, target);
+      emit_coherent_fb_read(bld, tmp, target);
      else
         emit_non_coherent_fb_read(ntb, bld, tmp, target);
      brw_combine_with_vec(bld, dest,
                           offset(tmp, bld, nir_intrinsic_component(instr)),
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@ -192,6 +192,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir,
                               const struct intel_vue_map *vue,
                               enum tess_primitive_mode tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
 bool brw_nir_lower_fs_load_output(nir_shader *shader,
                                  const struct brw_wm_prog_key *key);
 bool brw_nir_lower_cmat(nir_shader *nir, unsigned subgroup_size);
--- a/src/intel/compiler/brw_nir_lower_fs_load_output.c
+++ b/src/intel/compiler/brw_nir_lower_fs_load_output.c
@ -0,0 +1,61 @@
 /*
 * Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */
 #include "brw_nir.h"
 #include "compiler/nir/nir_builder.h"
 /**
 * Lower fragment shader output reads into sampler operations.
 *
 * Per-intrinsic callback for nir_shader_intrinsics_pass().  Every
 * load_output intrinsic is replaced by a texel fetch whose coordinate is
 * (uint(frag_coord.x), uint(frag_coord.y), layer_id) and whose texture index
 * is the intrinsic's base.  A txf on a 2D array is emitted when the key's
 * multisample_fbo is INTEL_NEVER; otherwise a txf_ms at the current sample
 * ID is emitted.
 *
 * \param b       Builder; its cursor is moved to just before \p intrin.
 * \param intrin  Intrinsic being visited.
 * \param data    The `const struct brw_wm_prog_key *` handed to the pass.
 *
 * \return true when \p intrin was a load_output and has been replaced,
 *         false when it was left untouched.
  */
 static bool
 brw_nir_lower_fs_load_output_instr(nir_builder *b,
                                    nir_intrinsic_instr *intrin,
                                    void *data)
 {
   if (intrin->intrinsic != nir_intrinsic_load_output)
      return false;
   const struct brw_wm_prog_key *key = data;
   /* Only used by Iris that never sets this to SOMETIMES */
   assert(key->multisample_fbo != INTEL_SOMETIMES);
   b->cursor = nir_before_instr(&intrin->instr);
   /* Load the fragment coordinate once and build every operand in its own
    * statement: evaluations inside an initializer list are indeterminately
    * sequenced in C, so emitting instructions from there would make the
    * generated instruction order compiler-dependent.
    */
   nir_def *frag_coord = nir_load_frag_coord(b);
   nir_def *pixel_x = nir_f2u32(b, nir_channel(b, frag_coord, 0));
   nir_def *pixel_y = nir_f2u32(b, nir_channel(b, frag_coord, 1));
   nir_def *layer = nir_load_layer_id(b);
   nir_def *coords[3] = { pixel_x, pixel_y, layer };
   nir_def *coord = nir_vec(b, coords, 3);
   nir_def *tex;
   if (key->multisample_fbo == INTEL_NEVER) {
      tex = nir_build_tex(b, nir_texop_txf, coord,
                          .texture_index = nir_intrinsic_base(intrin),
                          .dim = GLSL_SAMPLER_DIM_2D,
                          .is_array = true,
                          .dest_type = nir_type_uint32);
   } else {
      /* NOTE(review): this path passes the same 3-component coordinate as
       * the single-sampled one but does not set .is_array -- confirm that
       * is intended for GLSL_SAMPLER_DIM_MS.
       */
      nir_def *sample_id = nir_load_sample_id(b);
      tex = nir_build_tex(b, nir_texop_txf_ms, coord,
                          .texture_index = nir_intrinsic_base(intrin),
                          .ms_index = sample_id,
                          .dim = GLSL_SAMPLER_DIM_MS,
                          .dest_type = nir_type_uint32);
   }
   /* NOTE(review): nir_intrinsic_component(intrin) is not consulted here;
    * presumably the load and the fetch result have matching component
    * counts -- verify against the producers of load_output.
    */
   nir_def_replace(&intrin->def, tex);
   return true;
 }
 /**
 * Run brw_nir_lower_fs_load_output_instr() over the intrinsics of \p shader
 * through nir_shader_intrinsics_pass(), handing it \p key as callback data.
 *
 * \return the value returned by nir_shader_intrinsics_pass().
  */
 bool
 brw_nir_lower_fs_load_output(nir_shader *shader,
                              const struct brw_wm_prog_key *key)
 {
   /* The pass callback receives a plain `void *`, hence the cast that drops
    * the const qualifier; the callback only reads through it.
    */
   void *cb_data = (void *) key;
   const bool progress =
      nir_shader_intrinsics_pass(shader,
                                 brw_nir_lower_fs_load_output_instr,
                                 nir_metadata_control_flow,
                                 cb_data);
   return progress;
 }
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@ -73,6 +73,7 @@ libintel_compiler_brw_files = files(
  'brw_nir_lower_cs_intrinsics.c',
  'brw_nir_lower_alpha_to_coverage.c',
  'brw_nir_lower_fs_barycentrics.c',
  'brw_nir_lower_fs_load_output.c',
  'brw_nir_lower_immediate_offsets.c',
  'brw_nir_lower_intersection_shader.c',
  'brw_nir_lower_ray_queries.c',