asahi: Implement color masks with masked stores

Blend states can require masking colour. Currently, this is handled by nir_lower_blend, which lowers masks to a read-modify-write operation as required on Mali hardware. However, our "tilebuffer store" instruction supports a write mask, allowing us to write only a subset of channels to the tilebuffer. It's more efficient to use that than to emit pointless tilebuffer loads. Note that even without tilebuffer loads, non-opaque masks don't work with opaque pass types. Here, we handle this with a translucent pass type, which gets HSR to do the right thing and is consistent with the pass type used previously. However, it's a bit heavy handed -- Apple manages to use an opaque pass type with masking but with some unknown HSR fields twiddled. IMO reverse-engineering those details shouldn't block this because this gets us closer to optimal (just not all the way there) and is strictly better than what we had before. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21431>
2026-05-08 13:28:06 +02:00 · 2023-02-17 18:24:38 -05:00 · 2023-02-17 18:24:38 -05:00 · 029c686c6d
commit 029c686c6d
parent 3084e6e689
5 changed files with 94 additions and 12 deletions
--- a/src/asahi/lib/agx_meta.c
+++ b/src/asahi/lib/agx_meta.c
@ -19,7 +19,7 @@ agx_compile_meta_shader(struct agx_meta_cache *cache, nir_shader *shader,

   agx_preprocess_nir(shader);
   if (tib)
-      agx_nir_lower_tilebuffer(shader, tib);
+      agx_nir_lower_tilebuffer(shader, tib, NULL, NULL);

   struct agx_meta_shader *res = rzalloc(cache->ht, struct agx_meta_shader);
   agx_compile_shader_nir(shader, key, NULL, &binary, &res->info);
--- a/src/asahi/lib/agx_nir_lower_tilebuffer.c
+++ b/src/asahi/lib/agx_nir_lower_tilebuffer.c
@ -3,6 +3,7 @@
 * SPDX-License-Identifier: MIT
 */

+#include "compiler/agx_internal_formats.h"
 #include "agx_nir_format_helpers.h"
 #include "agx_tilebuffer.h"
 #include "nir.h"
@ -10,6 +11,12 @@

 #define ALL_SAMPLES 0xFF

+struct ctx { /* State handed to tib_impl through the lowering pass's data pointer */
+   struct agx_tilebuffer_layout *tib; /* Tilebuffer layout the shader is lowered against */
+   uint8_t *colormasks; /* Optional blend-state colour masks, indexed by render target; NULL skips this masking */
+   bool *translucent; /* Out-flag set to true when a store ends up partially masked; asserted non-NULL in that case */
+};
+
 static bool
 tib_filter(const nir_instr *instr, UNUSED const void *_)
 {
@ -29,7 +36,8 @@ tib_filter(const nir_instr *instr, UNUSED const void *_)
 static nir_ssa_def *
 tib_impl(nir_builder *b, nir_instr *instr, void *data)
 {
-   struct agx_tilebuffer_layout *tib = data;
+   struct ctx *ctx = data;
+   struct agx_tilebuffer_layout *tib = ctx->tib;
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
@ -41,10 +49,38 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
   unsigned comps = util_format_get_nr_components(logical_format);

   if (intr->intrinsic == nir_intrinsic_store_output) {
+      /* Only write components that actually exist */
+      uint16_t write_mask = BITFIELD_MASK(comps);
+
      /* Delete stores to nonexistent render targets */
      if (logical_format == PIPE_FORMAT_NONE)
         return NIR_LOWER_INSTR_PROGRESS_REPLACE;

+      /* Only write colours masked by the blend state */
+      if (ctx->colormasks)
+         write_mask &= ctx->colormasks[rt];
+
+      /* Masked stores require a translucent pass type */
+      if (write_mask != BITFIELD_MASK(comps)) {
+         assert(ctx->translucent != NULL &&
+                "colour masking requires translucency");
+
+         assert(agx_internal_format_supports_mask(format) &&
+                "write mask but format cannot be masked");
+
+         *(ctx->translucent) = true;
+      }
+
+      /* But we ignore the NIR write mask for that, since it's basically an
+       * optimization hint.
+       */
+      if (agx_internal_format_supports_mask(format))
+         write_mask &= nir_intrinsic_write_mask(intr);
+
+      /* Delete stores that are entirely masked out */
+      if (!write_mask)
+         return NIR_LOWER_INSTR_PROGRESS_REPLACE;
+
      nir_ssa_def *value = intr->src[0].ssa;

      /* Trim to format as required by hardware */
@ -60,11 +96,9 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
            value = nir_f2f32(b, value);
      }

-      nir_store_local_pixel_agx(
-         b, value, nir_imm_intN_t(b, ALL_SAMPLES, 16),
-         .base = tib->offset_B[rt],
-         .write_mask = nir_intrinsic_write_mask(intr) & BITFIELD_MASK(comps),
-         .format = format);
+      nir_store_local_pixel_agx(b, value, nir_imm_intN_t(b, ALL_SAMPLES, 16),
+                                .base = tib->offset_B[rt],
+                                .write_mask = write_mask, .format = format);

      return NIR_LOWER_INSTR_PROGRESS_REPLACE;
   } else {
@ -101,8 +135,16 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
 }

 bool /* Runs the tilebuffer lowering (tib_filter / tib_impl) over a fragment shader and returns the pass's result */
-agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib)
+agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib,
+                         uint8_t *colormasks, bool *translucent) /* colormasks may be NULL (no blend-state masking); translucent must be non-NULL if any store ends up partially masked */
 {
   assert(shader->info.stage == MESA_SHADER_FRAGMENT); /* only fragment shaders are accepted */
-   return nir_shader_lower_instructions(shader, tib_filter, tib_impl, tib);
+
+   struct ctx ctx = { /* the lowering callback takes a single data pointer, so bundle the three arguments */
+      .tib = tib,
+      .colormasks = colormasks,
+      .translucent = translucent,
+   };
+
+   return nir_shader_lower_instructions(shader, tib_filter, tib_impl, &ctx); /* tib_impl casts this pointer back to struct ctx */
 }
--- a/src/asahi/lib/agx_tilebuffer.c
+++ b/src/asahi/lib/agx_tilebuffer.c
@ -5,6 +5,7 @@

 #include "agx_tilebuffer.h"
 #include <assert.h>
+#include "compiler/agx_internal_formats.h"
 #include "util/format/u_format.h"
 #include "agx_formats.h"
 #include "agx_usc.h"
@ -77,6 +78,13 @@ agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt)
   return agx_pixel_format[tib->logical_format[rt]].internal;
 }

+bool /* Reports whether render target rt's physical format supports masked stores */
+agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib, unsigned rt)
+{
+   enum pipe_format fmt = agx_tilebuffer_physical_format(tib, rt); /* the internal format recorded for rt's logical format */
+   return agx_internal_format_supports_mask((enum agx_internal_formats)fmt); /* cast: the value is carried as a pipe_format but the helper takes the internal-format enum */
+}
+
 static unsigned
 agx_shared_layout_from_tile_size(struct agx_tile_size t)
 {
--- a/src/asahi/lib/agx_tilebuffer.h
+++ b/src/asahi/lib/agx_tilebuffer.h
@ -47,7 +47,8 @@ agx_build_tilebuffer_layout(enum pipe_format *formats, uint8_t nr_cbufs,
                            uint8_t nr_samples);

 bool agx_nir_lower_tilebuffer(struct nir_shader *shader,
-                              struct agx_tilebuffer_layout *tib);
+                              struct agx_tilebuffer_layout *tib,
+                              uint8_t *colormasks, bool *translucent);

 void agx_usc_tilebuffer(struct agx_usc_builder *b,
                        struct agx_tilebuffer_layout *tib);
@ -57,6 +58,9 @@ uint32_t agx_tilebuffer_total_size(struct agx_tilebuffer_layout *tib);
 enum pipe_format
 agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt);

+bool agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib,
+                                  unsigned rt);
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@ -1370,6 +1370,8 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,

   nir_shader *nir = nir_shader_clone(NULL, so->nir);

+   bool force_translucent = false;
+
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      struct asahi_vs_shader_key *key = &key_->vs;

@ -1393,9 +1395,25 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
         opts.format[i] = key->rt_formats[i];

      memcpy(opts.rt, key->blend.rt, sizeof(opts.rt));
-      NIR_PASS_V(nir, nir_lower_blend, &opts);

-      NIR_PASS_V(nir, agx_nir_lower_tilebuffer, &tib);
+      /* It's more efficient to use masked stores (with
+       * agx_nir_lower_tilebuffer) than to emulate colour masking with
+       * nir_lower_blend.
+       */
+      uint8_t colormasks[PIPE_MAX_COLOR_BUFS] = {0};
+
+      for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
+         if (agx_tilebuffer_supports_mask(&tib, i)) {
+            colormasks[i] = key->blend.rt[i].colormask;
+            opts.rt[i].colormask = BITFIELD_MASK(4);
+         } else {
+            colormasks[i] = BITFIELD_MASK(4);
+         }
+      }
+
+      NIR_PASS_V(nir, nir_lower_blend, &opts);
+      NIR_PASS_V(nir, agx_nir_lower_tilebuffer, &tib, colormasks,
+                 &force_translucent);

      if (key->sprite_coord_enable) {
         NIR_PASS_V(nir, nir_lower_texcoord_replace_late,
@ -1415,6 +1433,16 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,

   agx_compile_shader_nir(nir, &base_key, debug, &binary, &compiled->info);

+   /* reads_tib => Translucent pass type */
+   compiled->info.reads_tib |= force_translucent;
+
+   /* Could be optimized to use non-translucent pass types with the appropriate
+    * HSR configuration, but that mechanism is not yet understood. Warn that
+    * we're leaving perf on the table when used.
+    */
+   if (force_translucent)
+      perf_debug(dev, "Translucency forced due to colour masking");
+
   if (binary.size) {
      compiled->bo = agx_bo_create(dev, binary.size,
                                   AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");