asahi: implement VBO robustness

This implements GL robustness semantics. GLES (weaker) and VK (stronger) semantics
are left as a TODO, with explanations given in the code. Robustness is always
enabled in order to handle null VBOs; this should be optimized once we have soft
fault.

This necessitates a rework of VBO keys, but hopefully for the best.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Authored by Alyssa Rosenzweig on 2024-01-06 09:26:03 -04:00, committed by Marge Bot
parent 4aadf67523
commit 5dc0f5ccba
6 changed files with 152 additions and 28 deletions


@@ -171,7 +171,35 @@ pass(struct nir_builder *b, nir_instr *instr, void *data)
       el = nir_load_vertex_id(b);
    }
 
-   nir_def *base = nir_load_vbo_base_agx(b, nir_imm_int(b, attrib.buf));
+   /* VBO bases are per-attribute, otherwise they're per-buffer. This allows
+    * memory sinks to work properly with robustness, allows folding
+    * the src_offset into the VBO base to save an add in the shader, and reduces
+    * the size of the vertex fetch key. That last piece allows reusing a linked
+    * VS with both separate and interleaved attributes.
+    */
+   nir_def *buf_handle = nir_imm_int(b, index);
+
+   /* Robustness is handled at the ID level */
+   nir_def *bounds = nir_load_attrib_clamp_agx(b, buf_handle);
+
+   /* For now, robustness is always applied. This gives GL robustness semantics.
+    * For robustBufferAccess2, we'll want to check for out-of-bounds access
+    * (where el > bounds), and replace base with the address of a zero sink.
+    * With soft fault and a large enough sink, we don't need to clamp the index,
+    * allowing that robustness behaviour to be implemented in 2 cmpsel
+    * before the load. That is faster than the 4 cmpsel required after the load,
+    * and it avoids waiting on the load which should help prolog performance.
+    *
+    * TODO: Plumb through soft fault information to skip this.
+    *
+    * TODO: Add a knob for robustBufferAccess2 semantics.
+    */
+   bool robust = true;
+
+   if (robust) {
+      el = nir_umin(b, el, bounds);
+   }
+
+   nir_def *base = nir_load_vbo_base_agx(b, buf_handle);
 
    assert((stride % interchange_align) == 0 && "must be aligned");
    assert((offset % interchange_align) == 0 && "must be aligned");
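The robustBufferAccess2 path described in the TODO above is not implemented by this commit. A minimal sketch of what it could look like, assuming soft fault is available and the zero-sink address is plumbed into the lowering (the helper and its parameters below are hypothetical, not part of this change):

#include "nir_builder.h"

/* Hypothetical sketch: redirect the fetch base to a zero sink when the index
 * is out of bounds, instead of clamping the index. With soft fault and a
 * large enough sink, the unclamped index can then run past the end harmlessly.
 */
static nir_def *
robust2_vbo_base_sketch(nir_builder *b, nir_def *el, nir_def *bounds,
                        nir_def *base, nir_def *zero_sink)
{
   /* Out of bounds per the comment above: el > bounds, unsigned compare */
   nir_def *oob = nir_ult(b, bounds, el);

   /* A single select before the load; the 64-bit select is presumably what
    * the "2 cmpsel" in the comment refers to on AGX.
    */
   return nir_bcsel(b, oob, zero_sink, base);
}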


@@ -1839,10 +1839,16 @@ store("agx", [1, 1], [ACCESS, BASE, FORMAT, SIGN_EXTEND])
 # Logical complement of load_front_face, mapping to an AGX system value
 system_value("back_face_agx", 1, bit_sizes=[1, 32])
 
-# Load the base address of an indexed VBO (for lowering VBOs)
+# Load the base address of an indexed vertex attribute (for lowering).
 intrinsic("load_vbo_base_agx", src_comp=[1], dest_comp=1, bit_sizes=[64],
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
+# When vertex robustness is enabled, loads the maximum valid attribute index for
+# a given attribute. This is unsigned: the driver ensures that at least one
+# vertex is always valid to load, directing loads to a zero sink if necessary.
+intrinsic("load_attrib_clamp_agx", src_comp=[1], dest_comp=1,
+          bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Load a driver-internal system value from a given system value set at a given
 # binding within the set. This is used for correctness when lowering things like
 # UBOs with merged shaders.


@@ -137,8 +137,11 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_indirect(b, 1, 16, stage_table(b), &s->sampler_handle,
                                   intr->src[0].ssa);
    case nir_intrinsic_load_vbo_base_agx:
-      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT, &u->vbo_base,
-                                  intr->src[0].ssa);
+      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_base, intr->src[0].ssa);
+   case nir_intrinsic_load_attrib_clamp_agx:
+      return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_clamp, intr->src[0].ssa);
    case nir_intrinsic_load_blend_const_color_r_float:
       return load_sysval_root(b, 1, 32, &u->blend_constant[0]);
    case nir_intrinsic_load_blend_const_color_g_float:
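For orientation (not part of the change): in this lowering, the &u->attrib_base and &u->attrib_clamp expressions act as offset keys into struct agx_draw_uniforms, so the new case effectively makes the shader load its clamp from the root uniform table at an address along the lines of the sketch below. The helper and the exact address math are illustrative assumptions.

#include <stddef.h>
#include <stdint.h>
#include "agx_state.h" /* struct agx_draw_uniforms, see the header diff later in this commit */

/* Hypothetical sketch of the address a lowered load_attrib_clamp_agx resolves
 * to: root table base, plus the offset of the attrib_clamp array, plus the
 * attribute index scaled by the 32-bit element size.
 */
static uint64_t
attrib_clamp_addr_sketch(uint64_t root_table_base, uint32_t attrib_index)
{
   return root_table_base +
          offsetof(struct agx_draw_uniforms, attrib_clamp) +
          (uint64_t)attrib_index * sizeof(uint32_t);
}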


@@ -53,6 +53,7 @@
 #include "agx_device.h"
 #include "agx_disk_cache.h"
 #include "agx_nir_lower_gs.h"
+#include "agx_nir_lower_vbo.h"
 #include "agx_tilebuffer.h"
 #include "nir_builder.h"
 #include "nir_builder_opcodes.h"
@@ -1461,7 +1462,8 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
 {
    assert(count <= AGX_MAX_ATTRIBS);
 
-   struct agx_attribute *attribs = calloc(sizeof(*attribs), AGX_MAX_ATTRIBS);
+   struct agx_vertex_elements *so = calloc(1, sizeof(*so));
 
    for (unsigned i = 0; i < count; ++i) {
       const struct pipe_vertex_element ve = state[i];
@@ -1470,16 +1472,17 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
       unsigned chan_size = desc->channel[0].size / 8;
       assert((ve.src_offset & (chan_size - 1)) == 0);
 
-      attribs[i] = (struct agx_attribute){
-         .buf = ve.vertex_buffer_index,
-         .src_offset = ve.src_offset,
+      so->buffers[i] = ve.vertex_buffer_index;
+      so->src_offsets[i] = ve.src_offset;
+
+      so->key[i] = (struct agx_velem_key){
          .stride = ve.src_stride,
          .format = ve.src_format,
         .divisor = ve.instance_divisor,
      };
   }
 
-   return attribs;
+   return so;
 }
 
 static void
@@ -1836,6 +1839,22 @@ agx_nir_lower_poly_stipple(nir_shader *s)
    return true;
 }
 
+static bool
+lower_vbo(nir_shader *s, struct agx_velem_key *key)
+{
+   struct agx_attribute out[AGX_MAX_VBUFS];
+
+   for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
+      out[i] = (struct agx_attribute){
+         .divisor = key[i].divisor,
+         .stride = key[i].stride,
+         .format = key[i].format,
+      };
+   }
+
+   return agx_nir_lower_vbo(s, out);
+}
+
 /* Does not take ownership of key. Clones if necessary. */
 static struct agx_compiled_shader *
 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
@@ -1864,7 +1883,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       struct asahi_vs_shader_key *key = &key_->vs;
 
-      NIR_PASS(_, nir, agx_nir_lower_vbo, key->attribs);
+      NIR_PASS(_, nir, lower_vbo, key->attribs);
       NIR_PASS(_, nir, agx_nir_lower_point_size, key->fixed_point_size);
 
       if (should_lower_clip_m1_1(dev, key->clip_halfz)) {
@@ -1881,7 +1900,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
      /* Apply the VS key to the VS before linking it in */
-     NIR_PASS_V(vs, agx_nir_lower_vbo, key->attribs);
+     NIR_PASS_V(vs, lower_vbo, key->attribs);
      NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
      NIR_PASS_V(vs, agx_nir_lower_sysvals, false);
@@ -1903,7 +1922,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
      /* Apply the VS key to the VS before linking it in */
-     NIR_PASS(_, vs, agx_nir_lower_vbo, key->attribs);
+     NIR_PASS(_, vs, lower_vbo, key->attribs);
      NIR_PASS(_, vs, agx_nir_lower_ia, &key->ia);
      NIR_PASS(_, vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
@@ -2245,8 +2264,7 @@ agx_create_shader_state(struct pipe_context *pctx,
    switch (so->type) {
    case PIPE_SHADER_VERTEX: {
       for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
-         key.vs.attribs[i] = (struct agx_attribute){
-            .buf = i,
+         key.vs.attribs[i] = (struct agx_velem_key){
             .stride = 16,
             .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
          };
@@ -2409,8 +2427,7 @@ agx_update_vs(struct agx_context *ctx)
         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
                             (union asahi_shader_key *)&key);
@@ -2441,8 +2458,7 @@ agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
      .index_size_B = info->index_size,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                  sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
@@ -2491,8 +2507,7 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                  sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),


@@ -100,8 +100,13 @@ struct PACKED agx_draw_uniforms {
    /* Pointers to the system value tables themselves (for indirection) */
    uint64_t tables[AGX_NUM_SYSVAL_TABLES];
 
-   /* Vertex buffer object bases, if present */
-   uint64_t vbo_base[PIPE_MAX_ATTRIBS];
+   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
+    * attrib_base maps VBOs directly and attrib_clamp is undefined. If
+    * vertex robustness is enabled, attrib_base maps attributes and
+    * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
+    */
+   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
+   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];
 
    /* Address of input assembly buffer if geom/tess is used, else 0 */
    uint64_t input_assembly;
@@ -400,14 +405,31 @@ struct agx_blend {
    uint32_t store;
 };
 
+/* These parts of the vertex element affect the generated code */
+struct agx_velem_key {
+   uint32_t divisor;
+   uint16_t stride;
+   uint8_t format;
+   uint8_t pad;
+};
+
 struct asahi_vs_shader_key {
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
    bool clip_halfz;
    bool fixed_point_size;
 
    uint64_t outputs_flat_shaded;
    uint64_t outputs_linear_shaded;
 };
 
+struct agx_vertex_elements {
+   unsigned num_attribs;
+   struct agx_velem_key key[PIPE_MAX_ATTRIBS];
+
+   /* These parts do not affect the generated code so are not in the key */
+   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
+   uint16_t buffers[PIPE_MAX_ATTRIBS];
+};
+
 struct asahi_fs_shader_key {
    struct agx_blend_key blend;
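One payoff of the key/non-key split above (the helper below is illustrative, not driver code): vertex-elements state that differs only in src_offsets or buffer indices yields identical agx_velem_key arrays, so variant lookup can compare just the key and reuse one linked VS for both separate and interleaved layouts.

#include <string.h>
#include "agx_state.h" /* struct agx_vertex_elements, defined above */

/* Hypothetical helper: only the code-relevant part of the vertex elements
 * state needs to match for two states to share a compiled shader variant.
 */
static bool
velem_variants_compatible(const struct agx_vertex_elements *a,
                          const struct agx_vertex_elements *b)
{
   /* src_offsets[] and buffers[] are intentionally ignored: they only feed
    * the attrib_base/attrib_clamp uniforms uploaded at draw time.
    */
   return memcmp(a->key, b->key, sizeof(a->key)) == 0;
}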
@@ -429,7 +451,7 @@ struct asahi_tcs_shader_key {
    uint8_t index_size_B;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* Tessellation control shaders must be linked with a vertex shader. */
    uint8_t input_nir_sha1[20];
@@ -440,7 +462,7 @@ struct asahi_gs_shader_key {
    struct agx_ia_key ia;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* If true, this GS is run only for its side effects (including XFB) */
    bool rasterizer_discard;
@@ -561,7 +583,7 @@ struct agx_context {
    float default_inner_level[2];
 
    struct agx_stage stage[PIPE_SHADER_TYPES];
-   struct agx_attribute *attributes;
+   struct agx_vertex_elements *attributes;
    struct agx_rasterizer *rast;
    struct agx_zsa *zs;
    struct agx_blend *blend;


@@ -4,7 +4,9 @@
  */
 
 #include <stdio.h>
+#include "asahi/lib/agx_pack.h"
+#include "util/format/u_format.h"
 #include "agx_state.h"
+#include "pool.h"
 
 static uint64_t
 agx_const_buffer_ptr(struct agx_batch *batch, struct pipe_constant_buffer *cb)
@@ -38,7 +40,13 @@ void
 agx_upload_vbos(struct agx_batch *batch)
 {
    struct agx_context *ctx = batch->ctx;
+   struct agx_vertex_elements *attribs = ctx->attributes;
+   uint64_t buffers[PIPE_MAX_ATTRIBS] = {0};
+   size_t buf_sizes[PIPE_MAX_ATTRIBS] = {0};
 
+   /* TODO: To handle null vertex buffers, we use robustness always. Once we
+    * support soft fault in the kernel, we can optimize this.
+    */
    u_foreach_bit(vbo, ctx->vb_mask) {
       struct pipe_vertex_buffer vb = ctx->vertex_buffers[vbo];
       assert(!vb.is_user_buffer);
@@ -47,9 +55,51 @@ agx_upload_vbos(struct agx_batch *batch)
         struct agx_resource *rsrc = agx_resource(vb.buffer.resource);
         agx_batch_reads(batch, rsrc);
 
-        batch->uniforms.vbo_base[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+        buffers[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+        buf_sizes[vbo] = rsrc->layout.size_B - vb.buffer_offset;
+      }
+   }
+
+   for (unsigned i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
+      unsigned buffer_size = buf_sizes[attribs->buffers[i]];
+
+      /* Determine the maximum vertex/divided instance index. For robustness,
+       * the index will be clamped to this before reading (if soft fault is
+       * disabled).
+       *
+       * Index i accesses up to (exclusive) offset:
+       *
+       *    src_offset + (i * stride) + elsize_B
+       *
+       * so we require
+       *
+       *    src_offset + (i * stride) + elsize_B <= size
+       *
+       * <==>
+       *
+       *    i <= floor((size - src_offset - elsize_B) / stride)
+       */
+      unsigned elsize_B = util_format_get_blocksize(attribs->key[i].format);
+      unsigned subtracted = attribs->src_offsets[i] + elsize_B;
+
+      if (buffer_size >= subtracted) {
+         /* At least one index is valid, determine the max. If this is zero,
+          * only 1 index is valid.
+          */
+         unsigned max_index =
+            (buffer_size - subtracted) / attribs->key[i].stride;
+
+         batch->uniforms.attrib_base[i] =
+            buffers[attribs->buffers[i]] + attribs->src_offsets[i];
+         batch->uniforms.attrib_clamp[i] = max_index;
      } else {
-        batch->uniforms.vbo_base[vbo] = 0;
+         /* No indices are valid. Direct reads to a single zero. */
+         uint32_t zeroes[4] = {0};
+         uint64_t sink = agx_pool_upload_aligned(&batch->pool, &zeroes, 16, 16);
+
+         batch->uniforms.attrib_base[i] = sink;
+         batch->uniforms.attrib_clamp[i] = 0;
      }
   }
 }
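To make the clamp computation above concrete, a worked example with made-up numbers (none of these values come from the commit):

#include <assert.h>

/* Hypothetical example: 1000 bytes of buffer remain after buffer_offset,
 * src_offset = 4, format R32G32B32A32_FLOAT (elsize_B = 16), stride = 32.
 */
static void
attrib_clamp_example(void)
{
   unsigned buffer_size = 1000, src_offset = 4, elsize_B = 16, stride = 32;

   unsigned subtracted = src_offset + elsize_B;              /* 20 */
   unsigned max_index = (buffer_size - subtracted) / stride; /* 980 / 32 = 30 */

   /* Index 30 reads bytes [964, 980), in bounds; index 31 would reach 1012. */
   assert(max_index == 30);
   assert(src_offset + max_index * stride + elsize_B <= buffer_size);
   assert(src_offset + (max_index + 1) * stride + elsize_B > buffer_size);
}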