panfrost: Preprocess shaders in the driver

This is a flag-day change to how we compile. We split preprocessing NIR into a
separate step from compiling, giving the driver a chance to apply its own
lowerings on the preprocessed NIR before the final optimization loop. During
that time, the different producers of NIR (panfrost, panvk, blend shaders, blit
shaders...) will be able to (differently) lower system values.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20906>
This commit is contained in:
Alyssa Rosenzweig 2023-02-06 17:23:19 -05:00 committed by Marge Bot
parent 2a356cefba
commit ca2042f359
15 changed files with 133 additions and 66 deletions

View file

@ -77,6 +77,15 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
nir_shader *s = nir_shader_clone(NULL, ir);
/* While graphics shaders are preprocessed at CSO create time, compute
* kernels are not preprocessed until they're cloned since the driver does
* not get ownership of the NIR from compute CSOs. Do this preprocessing now.
* Compute CSOs call this function during create time, so preprocessing
* happens at CSO create time regardless.
*/
if (gl_shader_stage_is_compute(s->info.stage))
pan_shader_preprocess(s, dev->gpu_id);
struct panfrost_compile_inputs inputs = {
.debug = dbg,
.gpu_id = dev->gpu_id,
@ -109,6 +118,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
}
util_dynarray_init(&out->binary, NULL);
pan_shader_preprocess(s, inputs.gpu_id);
if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats,
pan_raw_format_mask_midgard(key->fs.rt_formats), false,
dev->gpu_id < 0x700);
}
screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);
assert(req_local_mem >= out->info.wls_size);

View file

@ -4716,7 +4716,7 @@ bi_lower_sample_mask_writes(nir_builder *b, nir_instr *instr, void *data)
}
static bool
bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
bi_lower_load_output(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
@ -4734,15 +4734,6 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
nir_ssa_def *conversion = nir_load_rt_conversion_pan(
b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
/* TODO: This should be optimized/lowered by the driver */
const struct panfrost_compile_inputs *inputs = data;
if (inputs->is_blend) {
conversion = nir_imm_int(b, inputs->blend.bifrost_blend_desc >> 32);
} else if (inputs->bifrost.static_rt_conv) {
conversion = nir_imm_int(b, inputs->bifrost.rt_conv[rt]);
}
nir_ssa_def *lowered = nir_load_converted_output_pan(
b, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest),
conversion, .dest_type = nir_intrinsic_dest_type(intr),
@ -4753,8 +4744,7 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
}
void
bifrost_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs)
bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
{
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
* (so we don't accidentally duplicate the epilogue since mesa/st has
@ -4781,7 +4771,7 @@ bifrost_preprocess_nir(nir_shader *nir,
* (currently unconditional for Valhall), we force vec4 alignment for
* scratch access.
*/
bool packed_tls = (inputs->gpu_id >= 0x9000);
bool packed_tls = (gpu_id >= 0x9000);
/* Lower large arrays to scratch and small arrays to bcsel */
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
@ -4810,10 +4800,9 @@ bifrost_preprocess_nir(nir_shader *nir,
nir_metadata_block_index | nir_metadata_dominance, NULL);
NIR_PASS_V(nir, nir_shader_instructions_pass, bi_lower_load_output,
nir_metadata_block_index | nir_metadata_dominance,
(void *)inputs);
nir_metadata_block_index | nir_metadata_dominance, NULL);
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
if (inputs->gpu_id >= 0x9000) {
if (gpu_id >= 0x9000) {
NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
}
@ -5251,7 +5240,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
{
bifrost_debug = debug_get_option_bifrost_debug();
bifrost_preprocess_nir(nir, inputs);
bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
struct hash_table_u64 *sysval_to_id =

View file

@ -28,8 +28,7 @@
#include "panfrost/util/pan_ir.h"
#include "util/u_dynarray.h"
void bifrost_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs);
void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void bifrost_compile_shader_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,

View file

@ -763,6 +763,42 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
return res;
}
/* Callback context for inline_rt_conversion: the device used to query blend
 * descriptors, and the per-render-target pixel formats to inline against.
 */
struct rt_conversion_inputs {
const struct panfrost_device *dev;
enum pipe_format *formats;
};
/* nir_shader_instructions_pass callback: replaces each
 * load_rt_conversion_pan intrinsic with an immediate built from the blend
 * internal descriptor for that render target's format. Returns true when the
 * instruction was rewritten.
 */
static bool
inline_rt_conversion(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_rt_conversion_pan)
return false;
struct rt_conversion_inputs *inputs = data;
/* The intrinsic's base selects the render target; the source type's size
 * selects the register format used for the conversion. */
unsigned rt = nir_intrinsic_base(intr);
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr));
uint64_t conversion = GENX(pan_blend_get_internal_desc)(
inputs->dev, inputs->formats[rt], rt, size, false);
/* Emit the immediate after the intrinsic, then redirect all users to it.
 * Only the high 32 bits of the internal descriptor are consumed here. */
b->cursor = nir_after_instr(instr);
nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_int(b, conversion >> 32));
return true;
}
/* Inline render-target conversion descriptors into a shader: every
 * load_rt_conversion_pan intrinsic is replaced by an immediate derived from
 * the corresponding entry of formats[]. Returns true if the shader changed.
 */
bool
GENX(pan_inline_rt_conversion)(nir_shader *s, const struct panfrost_device *dev,
                               enum pipe_format *formats)
{
   struct rt_conversion_inputs pass_data = {
      .dev = dev,
      .formats = formats,
   };

   return nir_shader_instructions_pass(s, inline_rt_conversion,
                                       nir_metadata_block_index |
                                          nir_metadata_dominance,
                                       &pass_data);
}
#endif
struct pan_blend_shader_variant *
@ -843,6 +879,11 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev,
#endif
struct pan_shader_info info;
pan_shader_preprocess(nir, inputs.gpu_id);
#if PAN_ARCH >= 6
NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), dev, inputs.rt_formats);
#endif
GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info);

View file

@ -161,6 +161,10 @@ nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev,
uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
enum pipe_format fmt, unsigned rt,
unsigned force_size, bool dithered);
bool GENX(pan_inline_rt_conversion)(nir_shader *s,
const struct panfrost_device *dev,
enum pipe_format *formats);
#endif
/* Take blend_shaders.lock before calling this function and release it when

View file

@ -631,6 +631,8 @@ pan_blitter_get_blit_shader(struct panfrost_device *dev,
for (unsigned i = 0; i < active_count; ++i)
BITSET_SET(b.shader->info.textures_used, i);
pan_shader_preprocess(b.shader, inputs.gpu_id);
if (PAN_ARCH == 4) {
NIR_PASS_V(b.shader, nir_shader_instructions_pass,
lower_sampler_parameters,

View file

@ -130,6 +130,7 @@ pan_indirect_dispatch_init(struct panfrost_device *dev)
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info);
ralloc_free(b.shader);

View file

@ -83,6 +83,27 @@ GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format)
#endif
#endif
/* Only Midgard needs this mask, and the logic is identical on v4 and v5, so
 * compile it once (in the v5 build) to spare call sites a GenXML dependency.
 */
#if PAN_ARCH == 5
uint8_t
pan_raw_format_mask_midgard(enum pipe_format *formats)
{
   uint8_t raw_mask = 0;
   unsigned rt = 0;

   /* A render target counts as "raw" when its writeback encoding sorts
    * below MALI_COLOR_FORMAT_R8. */
   while (rt < 8) {
      enum pipe_format fmt = formats[rt];

      if (panfrost_blendable_formats_v6[fmt].writeback < MALI_COLOR_FORMAT_R8)
         raw_mask |= BITFIELD_BIT(rt);

      ++rt;
   }

   return raw_mask;
}
#endif
void
GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary,
@ -93,14 +114,6 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
#if PAN_ARCH >= 6
bifrost_compile_shader_nir(s, inputs, binary, info);
#else
for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) {
enum pipe_format fmt = inputs->rt_formats[i];
unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback;
if (wb_fmt < MALI_COLOR_FORMAT_R8)
inputs->raw_fmt_mask |= BITFIELD_BIT(i);
}
midgard_compile_shader_nir(s, inputs, binary, info);
#endif

View file

@ -34,6 +34,20 @@
struct panfrost_device;
void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
/* Dispatch NIR preprocessing to the backend matching the GPU architecture:
 * Midgard for arch <= 5, Bifrost/Valhall otherwise. NIR producers run this
 * before applying their own lowerings and the final compile.
 */
static inline void
pan_shader_preprocess(nir_shader *nir, unsigned gpu_id)
{
   if (pan_arch(gpu_id) <= 5)
      midgard_preprocess_nir(nir, gpu_id);
   else
      bifrost_preprocess_nir(nir, gpu_id);
}
uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
#ifdef PAN_ARCH
const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void);

View file

@ -40,7 +40,6 @@
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "panfrost/util/pan_lower_framebuffer.h"
#include "compiler.h"
#include "helpers.h"
#include "midgard.h"
@ -330,10 +329,9 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data)
}
void
midgard_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs)
midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id)
{
unsigned quirks = midgard_get_quirks(inputs->gpu_id);
unsigned quirks = midgard_get_quirks(gpu_id);
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
* (so we don't accidentally duplicate the epilogue since mesa/st has
@ -391,10 +389,9 @@ midgard_preprocess_nir(nir_shader *nir,
NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
/* TEX_GRAD fails to apply sampler descriptor settings on some
* implementations, requiring a lowering. However, blit shaders do not
* use the affected settings and should skip the workaround.
* implementations, requiring a lowering.
*/
if ((quirks & MIDGARD_BROKEN_LOD) && !inputs->is_blit)
if (quirks & MIDGARD_BROKEN_LOD)
NIR_PASS_V(nir, midgard_nir_lod_errata);
/* Midgard image ops coordinates are 16-bit instead of 32-bit */
@ -417,12 +414,6 @@ midgard_preprocess_nir(nir_shader *nir,
NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
NIR_PASS_V(nir, nir_lower_var_copies);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats,
inputs->raw_fmt_mask, inputs->is_blend,
quirks & MIDGARD_BROKEN_BLEND_LOADS);
}
}
static void
@ -3177,8 +3168,6 @@ midgard_compile_shader_nir(nir_shader *nir,
ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
midgard_preprocess_nir(nir, inputs);
/* Collect varyings after lowering I/O */
pan_nir_collect_varyings(nir, info);

View file

@ -29,8 +29,7 @@
#include "panfrost/util/pan_ir.h"
#include "util/u_dynarray.h"
void midgard_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs);
void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void midgard_compile_shader_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,

View file

@ -184,7 +184,6 @@ struct panfrost_compile_inputs {
bool no_ubo_to_push;
enum pipe_format rt_formats[8];
uint8_t raw_fmt_mask;
/* Used on Valhall.
*
@ -198,7 +197,6 @@ struct panfrost_compile_inputs {
union {
struct {
bool static_rt_conv;
uint32_t rt_conv[8];
} bifrost;
};

View file

@ -61,6 +61,7 @@ panvk_meta_clear_color_attachment_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = 4;

View file

@ -449,17 +449,11 @@ panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
.no_ubo_to_push = true,
};
pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
cfg.register_format = dstcompsz == 2 ?
MALI_REGISTER_FILE_FORMAT_U16 :
MALI_REGISTER_FILE_FORMAT_U32;
}
inputs.bifrost.static_rt_conv = true;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, &dstfmt);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->fs.sample_shading = is_ms;
@ -984,17 +978,14 @@ panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
.no_ubo_to_push = true,
};
pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
cfg.register_format = imgcompsz == 2 ?
MALI_REGISTER_FILE_FORMAT_U16 :
MALI_REGISTER_FILE_FORMAT_U32;
}
inputs.bifrost.static_rt_conv = true;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
enum pipe_format rt_formats[8] = {key.imgfmt};
NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, rt_formats);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4);
@ -1434,6 +1425,7 @@ panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4);
@ -1662,6 +1654,7 @@ panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4);
@ -1791,6 +1784,7 @@ panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4);

View file

@ -153,10 +153,6 @@ panvk_lower_blend(struct panfrost_device *pdev,
rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO;
rt_state->equation.alpha_invert_dst_factor = false;
lower_blend = true;
inputs->bifrost.static_rt_conv = true;
inputs->bifrost.rt_conv[rt] =
GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32;
}
if (lower_blend) {
@ -371,6 +367,17 @@ panvk_per_arch(shader_create)(struct panvk_device *dev,
nir_print_shader(nir, stderr);
}
pan_shader_preprocess(nir, inputs.gpu_id);
if (stage == MESA_SHADER_FRAGMENT) {
enum pipe_format rt_formats[MAX_RTS] = {PIPE_FORMAT_NONE};
for (unsigned rt = 0; rt < MAX_RTS; ++rt)
rt_formats[rt] = blend_state->rts[rt].format;
NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), pdev, rt_formats);
}
GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info);
/* System values shouldn't have changed */