brw: lower non coherent FS load_output in NIR

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37527>
2025-12-22 00:30:13 +01:00 · 2025-09-02 13:39:29 +03:00 · 2025-09-02 13:39:29 +03:00 · d4ab2087cf
commit d4ab2087cf
parent 3b6b03bd3b
5 changed files with 69 additions and 78 deletions
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@ -1518,6 +1518,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
   brw_nir_lower_fs_inputs(nir, devinfo, key);
   brw_nir_lower_fs_outputs(nir);

+   if (!key->coherent_fb_fetch)
+      NIR_PASS(_, nir, brw_nir_lower_fs_load_output, key);
+
   /* From the SKL PRM, Volume 7, "Alpha Coverage":
    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
    *   hardware, regardless of the state setting for this feature."
--- a/src/intel/compiler/brw_from_nir.cpp
+++ b/src/intel/compiler/brw_from_nir.cpp
@ -3647,80 +3647,6 @@ emit_mcs_fetch(nir_to_brw_state &ntb, const brw_reg &coordinate, unsigned compon
   return dest;
 }

-/**
- * Fake non-coherent framebuffer read implemented using TXF to fetch from the
- * framebuffer at the current fragment coordinates and sample index.
- */
-static brw_inst *
-emit_non_coherent_fb_read(nir_to_brw_state &ntb, const brw_builder &bld, const brw_reg &dst,
-                          unsigned target)
-{
-   brw_shader &s = ntb.s;
-   const struct intel_device_info *devinfo = s.devinfo;
-
-   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
-   const brw_wm_prog_key *wm_key =
-      reinterpret_cast<const brw_wm_prog_key *>(s.key);
-   assert(!wm_key->coherent_fb_fetch);
-
-   /* Calculate the fragment coordinates. */
-   const brw_reg coords = bld.vgrf(BRW_TYPE_UD, 3);
-   bld.MOV(offset(coords, bld, 0), s.pixel_x);
-   bld.MOV(offset(coords, bld, 1), s.pixel_y);
-   bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
-
-   /* Calculate the sample index and MCS payload when multisampling.  Luckily
-    * the MCS fetch message behaves deterministically for UMS surfaces, so it
-    * shouldn't be necessary to recompile based on whether the framebuffer is
-    * CMS or UMS.
-    */
-   assert(wm_key->multisample_fbo == INTEL_ALWAYS ||
-          wm_key->multisample_fbo == INTEL_NEVER);
-   if (wm_key->multisample_fbo &&
-       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
-      ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
-
-   const brw_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
-   const brw_reg mcs = wm_key->multisample_fbo ?
-      emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), brw_reg()) : brw_reg();
-
-   /* Use either a normal or a CMS texel fetch message depending on whether
-    * the framebuffer is single or multisample.  On SKL+ use the wide CMS
-    * message just in case the framebuffer uses 16x multisampling, it should
-    * be equivalent to the normal CMS fetch for lower multisampling modes.
-    */
-   opcode op;
-   if (wm_key->multisample_fbo) {
-      /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
-       * multisampling, it should be equivalent to the normal CMS fetch for
-       * lower multisampling modes.
-       *
-       * On Gfx12HP, there is only CMS_W variant available.
-       */
-      if (devinfo->verx10 >= 125)
-         op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
-      else
-         op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
-   } else {
-      op = SHADER_OPCODE_TXF_LOGICAL;
-   }
-
-   /* Emit the instruction. */
-   brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
-   srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
-   srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
-   srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
-   srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
-   srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(target);
-   srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
-
-   brw_tex_inst *tex = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs))->as_tex();
-   tex->size_written = 4 * tex->dst.component_size(tex->exec_size);
-   tex->coord_components = 3;
-
-   return tex;
-}
-
 /**
 * Actual coherent framebuffer read implemented using the native render target
 * read message.  Requires SKL+.
@ -4260,10 +4186,8 @@ brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
      const brw_reg tmp = bld.vgrf(dest.type, 4);

-      if (reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch)
-         emit_coherent_fb_read(bld, tmp, target);
-      else
-         emit_non_coherent_fb_read(ntb, bld, tmp, target);
+      assert(reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch);
+      emit_coherent_fb_read(bld, tmp, target);

      brw_combine_with_vec(bld, dest,
                           offset(tmp, bld, nir_intrinsic_component(instr)),
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@ -192,6 +192,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir,
                               const struct intel_vue_map *vue,
                               enum tess_primitive_mode tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
+bool brw_nir_lower_fs_load_output(nir_shader *shader,
+                                  const struct brw_wm_prog_key *key);

 bool brw_nir_lower_cmat(nir_shader *nir, unsigned subgroup_size);

--- a/src/intel/compiler/brw_nir_lower_fs_load_output.c
+++ b/src/intel/compiler/brw_nir_lower_fs_load_output.c
@ -0,0 +1,61 @@
+/*
+ * Copyright © 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * Lower fragment shader output reads into sampler operations.
+ *
+ * Each load_output intrinsic is replaced by a texel fetch from the texture
+ * whose index is the intrinsic's base, addressed with the current fragment's
+ * integer X/Y position and layer id.  When the key's multisample_fbo is not
+ * INTEL_NEVER the fetch is a txf_ms that also takes the current sample id.
+ */
+
+/**
+ * Per-intrinsic callback for brw_nir_lower_fs_load_output().
+ *
+ * \param b       builder used to emit the replacement instructions
+ * \param intrin  intrinsic being visited
+ * \param data    the const struct brw_wm_prog_key * handed to the pass
+ *
+ * \return false (shader untouched) for anything other than load_output,
+ *         true once a load_output has been replaced.
+ */
+static bool
+brw_nir_lower_fs_load_output_instr(nir_builder *b,
+                                   nir_intrinsic_instr *intrin,
+                                   void *data)
+{
+   if (intrin->intrinsic != nir_intrinsic_load_output)
+      return false;
+
+   const struct brw_wm_prog_key *key = data;
+
+   /* Only used by Iris, which never sets this to SOMETIMES. */
+   assert(key->multisample_fbo != INTEL_SOMETIMES);
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   /* Load the fragment coordinate once and reuse it for both X and Y rather
+    * than emitting a separate load_frag_coord for each channel.
+    */
+   nir_def *frag_coord = nir_load_frag_coord(b);
+
+   /* Integer texel address: (x, y, layer). */
+   nir_def *coords[3] = {
+      nir_f2u32(b, nir_channel(b, frag_coord, 0)),
+      nir_f2u32(b, nir_channel(b, frag_coord, 1)),
+      nir_load_layer_id(b),
+   };
+   nir_def *coord = nir_vec(b, coords, 3);
+
+   /* NOTE(review): the multisample fetch is given the same 3-component
+    * coordinate as the single-sample one but does not set .is_array --
+    * confirm this is intended.
+    */
+   nir_def *tex =
+      key->multisample_fbo == INTEL_NEVER ?
+      nir_build_tex(b, nir_texop_txf, coord,
+                    .texture_index = nir_intrinsic_base(intrin),
+                    .dim = GLSL_SAMPLER_DIM_2D,
+                    .is_array = true,
+                    .dest_type = nir_type_uint32) :
+      nir_build_tex(b, nir_texop_txf_ms, coord,
+                    .texture_index = nir_intrinsic_base(intrin),
+                    .ms_index = nir_load_sample_id(b),
+                    .dim = GLSL_SAMPLER_DIM_MS,
+                    .dest_type = nir_type_uint32);
+
+   /* NOTE(review): the fetch result replaces the load_output def as-is, with
+    * no component selection; this assumes the load's component count and
+    * starting component match the fetch result -- confirm.
+    */
+   nir_def_replace(&intrin->def, tex);
+
+   return true;
+}
+
+/**
+ * Run the fragment shader load_output lowering over every intrinsic in
+ * \p shader.
+ *
+ * \param shader  shader to lower
+ * \param key     program key forwarded to the per-intrinsic callback
+ *
+ * \return the result of nir_shader_intrinsics_pass().
+ */
+bool
+brw_nir_lower_fs_load_output(nir_shader *shader,
+                             const struct brw_wm_prog_key *key)
+{
+   /* The callback only reads the key; the cast merely drops const so the
+    * pointer fits the pass's void * user-data parameter.
+    */
+   void *cb_data = (void *) key;
+   const bool progress =
+      nir_shader_intrinsics_pass(shader,
+                                 brw_nir_lower_fs_load_output_instr,
+                                 nir_metadata_control_flow,
+                                 cb_data);
+
+   return progress;
+}
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@ -73,6 +73,7 @@ libintel_compiler_brw_files = files(
  'brw_nir_lower_cs_intrinsics.c',
  'brw_nir_lower_alpha_to_coverage.c',
  'brw_nir_lower_fs_barycentrics.c',
+  'brw_nir_lower_fs_load_output.c',
  'brw_nir_lower_immediate_offsets.c',
  'brw_nir_lower_intersection_shader.c',
  'brw_nir_lower_ray_queries.c',