agx: Lower VBOs in NIR
Now we support all the vertex formats! This means we don't hit u_vbuf for format translation, which helps performance in lots of applications. By doing the lowering in NIR, the vertex fetch code itself can be optimized by NIR (e.g. nir_opt_algebraic), which can improve generated code quality.

In my first implementation of this, I had a big switch statement mapping format enums to interchange formats and post-processing code. This ends up being really unwieldy: the combinatorics of bit packing + conversion + swizzles is enormous, and for performance we want to support everything (no u_vbuf fallbacks). To keep the combinatorics in check, we rely on parsing the util_format_description to separate out the issues of bit packing, conversion, and swizzling, allowing us to handle bizarro formats like B10G10R10A2_SNORM with no special casing.

In an effort to support everything in one shot, this handles all the formats needed for the extensions EXT_vertex_array_bgra, ARB_vertex_type_2_10_10_10_rev, and ARB_vertex_type_10f_11f_11f_rev.

Passes dEQP-GLES3.functional.vertex_arrays.*

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19996>
This commit is contained in:
parent fb49715a2c
commit 8dcf7648f1
11 changed files with 318 additions and 222 deletions
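To make the decomposition described in the commit message concrete, here is a rough, hand-written sketch of what the lowering boils down to for one awkward case, PIPE_FORMAT_B10G10R10A2_SNORM. It is assembled from the helpers the new pass actually uses (nir_load_constant_agx, nir_format_unpack_sint, nir_format_snorm_to_float), but the surrounding variables (b, base, stride_offset_el) and the manual expansion are illustrative only, not code from this MR:

/* Interchange step: non-UNORM rgb10a2 is fetched as a single 32-bit word */
nir_ssa_def *raw = nir_load_constant_agx(b, 1, 32, base, stride_offset_el,
                                         .format = PIPE_FORMAT_R32_UINT);

/* Bit unpacking + conversion: 10/10/10/2 fields sign-extended, then snorm->float */
unsigned bits[4] = { 10, 10, 10, 2 };
nir_ssa_def *unpacked = nir_format_unpack_sint(b, raw, bits, 4);
nir_ssa_def *norm = nir_format_snorm_to_float(b, unpacked, bits);

/* Swizzle: B10G10R10A2 stores blue in the low bits, so the format swizzle
 * (Z, Y, X, W) reorders the unpacked channels into RGBA */
nir_ssa_def *rgba = nir_vec4(b, nir_channel(b, norm, 2),
                                nir_channel(b, norm, 1),
                                nir_channel(b, norm, 0),
                                nir_channel(b, norm, 3));

Because all of this is ordinary NIR, nir_opt_algebraic and the other NIR passes can then optimize the fetch together with the rest of the vertex shader, which is the point the commit message makes about code quality.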
@@ -108,7 +108,7 @@ GL 3.3, GLSL 3.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv
   GL_ARB_texture_swizzle                      DONE (v3d, vc4, panfrost, lima, asahi)
   GL_ARB_timer_query                          DONE ()
   GL_ARB_instanced_arrays                     DONE (etnaviv/HALTI2, v3d, panfrost)
-  GL_ARB_vertex_type_2_10_10_10_rev           DONE (v3d, panfrost)
+  GL_ARB_vertex_type_2_10_10_10_rev           DONE (v3d, panfrost, asahi)


 GL 4.0, GLSL 4.00 --- all DONE: freedreno/a6xx, i965/gen7+, nvc0, r600, radeonsi, llvmpipe, virgl, zink, d3d12

@@ -208,7 +208,7 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, i965/gen8+, nvc0, r600, radeonsi,
   GL_ARB_query_buffer_object                  DONE (freedreno/a6xx, i965/hsw+, virgl)
   GL_ARB_texture_mirror_clamp_to_edge         DONE (freedreno, i965, nv50, softpipe, virgl, v3d, panfrost)
   GL_ARB_texture_stencil8                     DONE (freedreno, i965/hsw+, nv50, softpipe, virgl, v3d, panfrost, d3d12, asahi)
-  GL_ARB_vertex_type_10f_11f_11f_rev          DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12)
+  GL_ARB_vertex_type_10f_11f_11f_rev          DONE (freedreno, i965, nv50, softpipe, virgl, panfrost, d3d12, asahi)

 GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, zink

@@ -358,61 +358,6 @@ agx_format_for_pipe(enum pipe_format format)
    unreachable("Invalid format");
 }

-/* AGX appears to lack support for vertex attributes. Lower to global loads. */
-static void
-agx_emit_load_attr(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
-{
-   nir_src *offset_src = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
-   unsigned index = nir_intrinsic_base(instr) +
-                    nir_src_as_uint(*offset_src);
-
-   struct agx_shader_key *key = b->shader->key;
-   struct agx_attribute attrib = key->vs.attributes[index];
-
-   /* address = base + (stride * vertex_id) + src_offset */
-   unsigned buf = attrib.buf;
-   unsigned stride = key->vs.vbuf_strides[buf];
-   unsigned shift = agx_format_shift(attrib.format);
-
-   agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
-   agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
-
-   /* A nonzero divisor requires dividing the instance ID. A zero divisor
-    * specifies per-instance data. */
-   agx_index element_id = (attrib.divisor == 0) ? agx_vertex_id(b) :
-                          agx_udiv_const(b, agx_instance_id(b), attrib.divisor);
-
-   agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
-
-   /* Each VBO has a 64-bit = 4 x 16-bit address, lookup the base address as a
-    * sysval. Mov around the base to handle uniform restrictions, copyprop will
-    * usually clean that up.
-    */
-   agx_index base = agx_mov(b, agx_vbo_base(b->shader, buf));
-
-   /* Load the data */
-   assert(instr->num_components <= 4);
-
-   unsigned actual_comps = (attrib.nr_comps_minus_1 + 1);
-   agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
-   agx_device_load_to(b, vec, base, offset, attrib.format,
-                      BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0, 0);
-   agx_wait(b, 0);
-
-   agx_index dests[4] = { agx_null() };
-   agx_emit_split(b, dests, vec, actual_comps);
-
-   agx_index one = agx_mov_imm(b, 32, fui(1.0));
-   agx_index zero = agx_mov_imm(b, 32, 0);
-   agx_index default_value[4] = { zero, zero, zero, one };
-
-   for (unsigned i = actual_comps; i < instr->num_components; ++i)
-      dests[i] = default_value[i];
-
-   agx_emit_collect_to(b, dest, instr->num_components, dests);
-}
-
 static void
 agx_emit_load_vary_flat(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
 {

@@ -733,13 +678,8 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
      return NULL;

   case nir_intrinsic_load_input:
-      if (stage == MESA_SHADER_FRAGMENT)
-         agx_emit_load_vary_flat(b, dst, instr);
-      else if (stage == MESA_SHADER_VERTEX)
-         agx_emit_load_attr(b, dst, instr);
-      else
-         unreachable("Unsupported shader stage");
-
+      assert(stage == MESA_SHADER_FRAGMENT && "vertex loads lowered");
+      agx_emit_load_vary_flat(b, dst, instr);
      return NULL;

   case nir_intrinsic_load_global:

@@ -785,6 +725,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
                              nir_src_as_uint(instr->src[0]) * 4,
                              b->shader->nir->info.num_ubos * 4));

+   case nir_intrinsic_load_vbo_base_agx:
+      return agx_mov_to(b, dst,
+                        agx_vbo_base(b->shader, nir_src_as_uint(instr->src[0])));
+
   case nir_intrinsic_load_vertex_id:
      return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));

@@ -182,8 +182,6 @@ struct agx_shader_info {
 };

 #define AGX_MAX_RTS (8)
-#define AGX_MAX_ATTRIBS (16)
-#define AGX_MAX_VBUFS (16)

 enum agx_format {
    AGX_FORMAT_I8 = 0,

@@ -203,56 +201,6 @@ enum agx_format {
    AGX_NUM_FORMATS,
 };

-/* Returns the number of bits at the bottom of the address required to be zero.
- * That is, returns the base-2 logarithm of the minimum alignment for an
- * agx_format, where the minimum alignment is 2^n where n is the result of this
- * function. The offset argument to device_load is left-shifted by this amount
- * in the hardware */
-
-static inline unsigned
-agx_format_shift(enum agx_format format)
-{
-   switch (format) {
-   case AGX_FORMAT_I8:
-   case AGX_FORMAT_U8NORM:
-   case AGX_FORMAT_S8NORM:
-   case AGX_FORMAT_SRGBA8:
-      return 0;
-
-   case AGX_FORMAT_I16:
-   case AGX_FORMAT_F16:
-   case AGX_FORMAT_U16NORM:
-   case AGX_FORMAT_S16NORM:
-      return 1;
-
-   case AGX_FORMAT_I32:
-   case AGX_FORMAT_RGB10A2:
-   case AGX_FORMAT_RG11B10F:
-   case AGX_FORMAT_RGB9E5:
-      return 2;
-
-   default:
-      unreachable("invalid format");
-   }
-}
-
-struct agx_attribute {
-   uint32_t divisor;
-
-   unsigned buf : 5;
-   unsigned src_offset : 16;
-   unsigned nr_comps_minus_1 : 2;
-   enum agx_format format : 4;
-   unsigned padding : 5;
-};
-
-struct agx_vs_shader_key {
-   unsigned num_vbufs;
-   unsigned vbuf_strides[AGX_MAX_VBUFS];
-
-   struct agx_attribute attributes[AGX_MAX_ATTRIBS];
-};
-
 struct agx_fs_shader_key {
    /* Normally, access to the tilebuffer must be guarded by appropriate fencing
     * instructions to ensure correct results in the presence of out-of-order

@@ -269,7 +217,6 @@ struct agx_fs_shader_key {

 struct agx_shader_key {
    union {
-      struct agx_vs_shader_key vs;
       struct agx_fs_shader_key fs;
    };
 };
@@ -190,69 +190,3 @@ const struct agx_pixel_format_entry agx_pixel_format[PIPE_FORMAT_COUNT] = {
    AGX_FMT(BPTC_RGBA_UNORM, BC7, UNORM, F, _),
    AGX_FMT(BPTC_SRGBA, BC7, UNORM, F, _),
 };
-
-const enum agx_format
-agx_vertex_format[PIPE_FORMAT_COUNT] = {
-   [PIPE_FORMAT_R32_FLOAT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32_FLOAT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32_FLOAT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32A32_FLOAT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32A32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32A32_SINT] = AGX_FORMAT_I32,
-
-   [PIPE_FORMAT_R8_UNORM] = AGX_FORMAT_U8NORM,
-   [PIPE_FORMAT_R8G8_UNORM] = AGX_FORMAT_U8NORM,
-   [PIPE_FORMAT_R8G8B8_UNORM] = AGX_FORMAT_U8NORM,
-   [PIPE_FORMAT_R8G8B8A8_UNORM] = AGX_FORMAT_U8NORM,
-
-   [PIPE_FORMAT_R8_SNORM] = AGX_FORMAT_S8NORM,
-   [PIPE_FORMAT_R8G8_SNORM] = AGX_FORMAT_S8NORM,
-   [PIPE_FORMAT_R8G8B8_SNORM] = AGX_FORMAT_S8NORM,
-   [PIPE_FORMAT_R8G8B8A8_SNORM] = AGX_FORMAT_S8NORM,
-
-   [PIPE_FORMAT_R16_UNORM] = AGX_FORMAT_U16NORM,
-   [PIPE_FORMAT_R16G16_UNORM] = AGX_FORMAT_U16NORM,
-   [PIPE_FORMAT_R16G16B16_UNORM] = AGX_FORMAT_U16NORM,
-   [PIPE_FORMAT_R16G16B16A16_UNORM] = AGX_FORMAT_U16NORM,
-
-   [PIPE_FORMAT_R16_SNORM] = AGX_FORMAT_S16NORM,
-   [PIPE_FORMAT_R16G16_SNORM] = AGX_FORMAT_S16NORM,
-   [PIPE_FORMAT_R16G16B16_SNORM] = AGX_FORMAT_S16NORM,
-   [PIPE_FORMAT_R16G16B16A16_SNORM] = AGX_FORMAT_S16NORM,
-
-   [PIPE_FORMAT_R8_UINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8_UINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8B8_UINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8B8A8_UINT] = AGX_FORMAT_I8,
-
-   [PIPE_FORMAT_R8_SINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8_SINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8B8_SINT] = AGX_FORMAT_I8,
-   [PIPE_FORMAT_R8G8B8A8_SINT] = AGX_FORMAT_I8,
-
-   [PIPE_FORMAT_R16_UINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16_UINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16B16_UINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16B16A16_UINT] = AGX_FORMAT_I16,
-
-   [PIPE_FORMAT_R16_SINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16_SINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16B16_SINT] = AGX_FORMAT_I16,
-   [PIPE_FORMAT_R16G16B16A16_SINT] = AGX_FORMAT_I16,
-
-   [PIPE_FORMAT_R32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32_UINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32A32_UINT] = AGX_FORMAT_I32,
-
-   [PIPE_FORMAT_R32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32_SINT] = AGX_FORMAT_I32,
-   [PIPE_FORMAT_R32G32B32A32_SINT] = AGX_FORMAT_I32,
-};
@@ -26,7 +26,6 @@
 #define __AGX_FORMATS_H_

 #include "util/format/u_format.h"
-#include "asahi/compiler/agx_compile.h"

 struct agx_pixel_format_entry {
    uint8_t channels;

@@ -36,7 +35,6 @@ struct agx_pixel_format_entry {
 };

 extern const struct agx_pixel_format_entry agx_pixel_format[PIPE_FORMAT_COUNT];
-extern const enum agx_format agx_vertex_format[PIPE_FORMAT_COUNT];

 /* N.b. hardware=0 corresponds to R8 UNORM, which is renderable. So a zero
  * entry indicates an invalid format. */
src/asahi/lib/agx_nir_lower_vbo.c (new file, 239 lines)
@@ -0,0 +1,239 @@
/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_nir_lower_vbo.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
#include "util/u_math.h"

static bool
is_rgb10_a2(const struct util_format_description *desc)
{
   return desc->channel[0].shift == 0 && desc->channel[0].size == 10 &&
          desc->channel[1].shift == 10 && desc->channel[1].size == 10 &&
          desc->channel[2].shift == 20 && desc->channel[2].size == 10 &&
          desc->channel[3].shift == 30 && desc->channel[3].size == 2;
}

static enum pipe_format
agx_vbo_internal_format(enum pipe_format format)
{
   const struct util_format_description *desc = util_format_description(format);

   /* RGB10A2 formats are native for UNORM and unpacked otherwise */
   if (is_rgb10_a2(desc)) {
      if (desc->is_unorm)
         return PIPE_FORMAT_R10G10B10A2_UNORM;
      else
         return PIPE_FORMAT_R32_UINT;
   }

   /* R11G11B10F is native and special */
   if (format == PIPE_FORMAT_R11G11B10_FLOAT)
      return format;

   /* No other non-array formats handled */
   if (!desc->is_array)
      return PIPE_FORMAT_NONE;

   /* Otherwise look at one (any) channel */
   int idx = util_format_get_first_non_void_channel(format);
   if (idx < 0)
      return PIPE_FORMAT_NONE;

   /* We only handle RGB formats (we could do SRGB if we wanted though?) */
   if ((desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB) ||
       (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN))
      return PIPE_FORMAT_NONE;

   /* We have native 8-bit and 16-bit normalized formats */
   struct util_format_channel_description chan = desc->channel[idx];

   if (chan.normalized) {
      if (chan.size == 8)
         return desc->is_unorm ? PIPE_FORMAT_R8_UNORM : PIPE_FORMAT_R8_SNORM;
      else if (chan.size == 16)
         return desc->is_unorm ? PIPE_FORMAT_R16_UNORM : PIPE_FORMAT_R16_SNORM;
   }

   /* Otherwise map to the corresponding integer format */
   switch (chan.size) {
   case 32: return PIPE_FORMAT_R32_UINT;
   case 16: return PIPE_FORMAT_R16_UINT;
   case 8:  return PIPE_FORMAT_R8_UINT;
   default: return PIPE_FORMAT_NONE;
   }
}

bool
agx_vbo_supports_format(enum pipe_format format)
{
   return agx_vbo_internal_format(format) != PIPE_FORMAT_NONE;
}

static nir_ssa_def *
apply_swizzle_channel(nir_builder *b, nir_ssa_def *vec,
                      unsigned swizzle, bool is_int)
{
   switch (swizzle) {
   case PIPE_SWIZZLE_X: return nir_channel(b, vec, 0);
   case PIPE_SWIZZLE_Y: return nir_channel(b, vec, 1);
   case PIPE_SWIZZLE_Z: return nir_channel(b, vec, 2);
   case PIPE_SWIZZLE_W: return nir_channel(b, vec, 3);
   case PIPE_SWIZZLE_0: return nir_imm_intN_t(b, 0, vec->bit_size);
   case PIPE_SWIZZLE_1: return is_int ? nir_imm_intN_t(b, 1, vec->bit_size) :
                                        nir_imm_floatN_t(b, 1.0, vec->bit_size);
   default: unreachable("Invalid swizzle channel");
   }
}

static bool
pass(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_input)
      return false;

   struct agx_vbufs *vbufs = data;
   b->cursor = nir_before_instr(instr);

   nir_src *offset_src = nir_get_io_offset_src(intr);
   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
   unsigned index = nir_intrinsic_base(intr) + nir_src_as_uint(*offset_src);

   struct agx_attribute attrib = vbufs->attributes[index];
   uint32_t stride = vbufs->strides[attrib.buf];
   uint16_t offset = attrib.src_offset;

   const struct util_format_description *desc =
      util_format_description(attrib.format);
   int chan = util_format_get_first_non_void_channel(attrib.format);
   assert(chan >= 0);

   bool is_float = desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   bool is_unsigned = desc->channel[chan].type == UTIL_FORMAT_TYPE_UNSIGNED;
   bool is_signed = desc->channel[chan].type == UTIL_FORMAT_TYPE_SIGNED;
   bool is_fixed = desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   bool is_int = util_format_is_pure_integer(attrib.format);

   assert((is_float ^ is_unsigned ^ is_signed ^ is_fixed) && "Invalid format");

   enum pipe_format interchange_format = agx_vbo_internal_format(attrib.format);
   assert(interchange_format != PIPE_FORMAT_NONE);

   unsigned interchange_align = util_format_get_blocksize(interchange_format);
   unsigned interchange_comps = util_format_get_nr_components(attrib.format);

   /* In the hardware, uint formats zero-extend and float formats convert.
    * However, non-uint formats using a uint interchange format shouldn't be
    * zero extended.
    */
   unsigned interchange_register_size =
      util_format_is_pure_uint(interchange_format) &&
      !util_format_is_pure_uint(attrib.format) ?
      (interchange_align * 8) :
      nir_dest_bit_size(intr->dest);

   /* Non-UNORM R10G10B10A2 loaded as a scalar and unpacked */
   if (interchange_format == PIPE_FORMAT_R32_UINT && !desc->is_array)
      interchange_comps = 1;

   /* Calculate the element to fetch the vertex for. Divide the instance ID by
    * the divisor for per-instance data. Divisor=0 specifies per-vertex data.
    */
   nir_ssa_def *el = (attrib.divisor == 0) ?
                     nir_load_vertex_id(b) :
                     nir_udiv_imm(b, nir_load_instance_id(b), attrib.divisor);

   nir_ssa_def *base = nir_load_vbo_base_agx(b, nir_imm_int(b, attrib.buf));

   assert((stride % interchange_align) == 0 && "must be aligned");
   assert((offset % interchange_align) == 0 && "must be aligned");

   unsigned stride_el = stride / interchange_align;
   unsigned offset_el = offset / interchange_align;

   nir_ssa_def *stride_offset_el =
      nir_iadd_imm(b, nir_imul_imm(b, el, stride_el), offset_el);

   /* Load the raw vector */
   nir_ssa_def *memory =
      nir_load_constant_agx(b, interchange_comps,
                            interchange_register_size,
                            base,
                            stride_offset_el,
                            .format = interchange_format);

   unsigned dest_size = nir_dest_bit_size(intr->dest);

   /* Unpack but do not convert non-native non-array formats */
   if (is_rgb10_a2(desc) && interchange_format == PIPE_FORMAT_R32_UINT) {
      unsigned bits[] = { 10, 10, 10, 2 };

      if (is_signed)
         memory = nir_format_unpack_sint(b, memory, bits, 4);
      else
         memory = nir_format_unpack_uint(b, memory, bits, 4);
   }

   if (desc->channel[chan].normalized) {
      /* 8/16-bit normalized formats are native, others converted here */
      if (is_rgb10_a2(desc) && is_signed) {
         unsigned bits[] = { 10, 10, 10, 2 };
         memory = nir_format_snorm_to_float(b, memory, bits);
      } else if (desc->channel[chan].size == 32) {
         assert(desc->is_array && "no non-array 32-bit norm formats");
         unsigned bits[] = { 32, 32, 32, 32 };

         if (is_signed)
            memory = nir_format_snorm_to_float(b, memory, bits);
         else
            memory = nir_format_unorm_to_float(b, memory, bits);
      }
   } else if (desc->channel[chan].pure_integer) {
      /* Zero-extension is native, may need to sign extend */
      if (is_signed)
         memory = nir_i2iN(b, memory, dest_size);
   } else {
      if (is_unsigned)
         memory = nir_u2fN(b, memory, dest_size);
      else if (is_signed || is_fixed)
         memory = nir_i2fN(b, memory, dest_size);
      else
         memory = nir_f2fN(b, memory, dest_size);

      /* 16.16 fixed-point weirdo GL formats need to be scaled */
      if (is_fixed) {
         assert(desc->is_array && desc->channel[chan].size == 32);
         assert(dest_size == 32 && "overflow if smaller");
         memory = nir_fmul_imm(b, memory, 1.0 / 65536.0);
      }
   }

   /* We now have a properly formatted vector of the components in memory. Apply
    * the format swizzle forwards to trim/pad/reorder as needed.
    */
   nir_ssa_def *channels[4] = { NULL };
   assert(nir_intrinsic_component(intr) == 0 && "unimplemented");

   for (unsigned i = 0; i < intr->num_components; ++i)
      channels[i] = apply_swizzle_channel(b, memory, desc->swizzle[i], is_int);

   nir_ssa_def *logical = nir_vec(b, channels, intr->num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, logical);
   return true;
}

bool
agx_nir_lower_vbo(nir_shader *shader, struct agx_vbufs *vbufs)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX);
   return nir_shader_instructions_pass(shader, pass,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       vbufs);
}
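As a hypothetical worked example of the addressing math in the pass above (the stride, offset, and format here are illustrative, not taken from the MR): consider an R16G16B16_SNORM attribute at src_offset 4 in a vertex buffer with stride 12. The interchange format is R16_SNORM, so interchange_align is 2 and the computation reduces to:

/* Illustrative values only */
stride_el        = 12 / 2 = 6
offset_el        =  4 / 2 = 2
stride_offset_el = el * 6 + 2

/* nir_load_constant_agx then fetches 3 x 16-bit elements at that element
 * index; with the offset scaled by the element size (as the agx_format_shift
 * comment removed above describes for the hardware), that is byte address
 * base + (el * 6 + 2) * 2 = base + 12 * el + 4. */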
src/asahi/lib/agx_nir_lower_vbo.h (new file, 46 lines)
@@ -0,0 +1,46 @@
/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#ifndef __AGX_NIR_LOWER_VBO_H
#define __AGX_NIR_LOWER_VBO_H

#include <stdint.h>
#include <stdbool.h>
#include "nir.h"
#include "util/format/u_formats.h"

#ifdef __cplusplus
extern "C" {
#endif

#define AGX_MAX_ATTRIBS (16)
#define AGX_MAX_VBUFS (16)

/* See pipe_vertex_element for justification on the sizes. This structure should
 * be small so it can be embedded into a shader key.
 */
struct agx_attribute {
   uint32_t divisor;
   uint16_t src_offset;
   uint8_t buf;

   /* pipe_format, all vertex formats should be <= 255 */
   uint8_t format;
};

struct agx_vbufs {
   unsigned count;
   uint32_t strides[AGX_MAX_VBUFS];
   struct agx_attribute attributes[AGX_MAX_ATTRIBS];
};

bool agx_nir_lower_vbo(nir_shader *shader, struct agx_vbufs *vbufs);
bool agx_vbo_supports_format(enum pipe_format format);

#ifdef __cplusplus
} /* extern C */
#endif

#endif
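A minimal, hypothetical sketch of how a driver feeds this interface (buffer index, stride, and format below are made up for illustration; the real gallium wiring is in the agx_create_shader_state/agx_update_vs changes further down):

struct agx_vbufs vbufs = {
   .count = 1,
   .strides = { [0] = 16 },   /* byte stride of vertex buffer 0 */
   .attributes = {
      /* attribute 0: per-vertex vec4 float at offset 0 of buffer 0 */
      [0] = {
         .divisor = 0,
         .src_offset = 0,
         .buf = 0,
         .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
      },
   },
};

/* Run on the vertex shader before handing it to the backend compiler */
NIR_PASS_V(nir, agx_nir_lower_vbo, &vbufs);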
@@ -27,6 +27,7 @@ libasahi_lib_files = files(
   'agx_meta.c',
   'agx_tilebuffer.c',
   'agx_nir_lower_tilebuffer.c',
+  'agx_nir_lower_vbo.c',
   'agx_ppp.h',
   'pool.c',
 )
@@ -1573,18 +1573,8 @@ agx_is_format_supported(struct pipe_screen* pscreen,
         return false;
   }

-   /* TODO: formats */
-   if (usage & PIPE_BIND_VERTEX_BUFFER) {
-      switch (format) {
-      case PIPE_FORMAT_R32_FLOAT:
-      case PIPE_FORMAT_R32G32_FLOAT:
-      case PIPE_FORMAT_R32G32B32_FLOAT:
-      case PIPE_FORMAT_R32G32B32A32_FLOAT:
-         break;
-      default:
-         return false;
-      }
-   }
+   if ((usage & PIPE_BIND_VERTEX_BUFFER) && !agx_vbo_supports_format(format))
+      return false;

   if (usage & PIPE_BIND_DEPTH_STENCIL) {
      switch (format) {
@@ -987,18 +987,13 @@ agx_create_vertex_elements(struct pipe_context *ctx,

      const struct util_format_description *desc =
         util_format_description(ve.src_format);

      unsigned chan_size = desc->channel[0].size / 8;

-      assert(chan_size == 1 || chan_size == 2 || chan_size == 4);
-      assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
      assert((ve.src_offset & (chan_size - 1)) == 0);

      attribs[i] = (struct agx_attribute) {
         .buf = ve.vertex_buffer_index,
-         .src_offset = ve.src_offset / chan_size,
-         .nr_comps_minus_1 = desc->nr_channels - 1,
-         .format = agx_vertex_format[ve.src_format],
+         .src_offset = ve.src_offset,
+         .format = ve.src_format,
         .divisor = ve.instance_divisor
      };
   }

@@ -1184,7 +1179,9 @@ agx_compile_variant(struct agx_device *dev,

   agx_preprocess_nir(nir);

-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      NIR_PASS_V(nir, agx_nir_lower_vbo, &key->vbuf);
+   } else {
      struct agx_tilebuffer_layout tib =
         agx_build_tilebuffer_layout(key->rt_formats, key->nr_cbufs, 1);

@@ -1243,13 +1240,12 @@ agx_create_shader_state(struct pipe_context *pctx,
   switch (so->nir->info.stage) {
   case MESA_SHADER_VERTEX:
   {
-      key.base.vs.num_vbufs = AGX_MAX_VBUFS;
+      key.vbuf.count = AGX_MAX_VBUFS;
      for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
-         key.base.vs.vbuf_strides[i] = 16;
-         key.base.vs.attributes[i] = (struct agx_attribute) {
+         key.vbuf.strides[i] = 16;
+         key.vbuf.attributes[i] = (struct agx_attribute) {
            .buf = i,
-            .nr_comps_minus_1 = 4 - 1,
-            .format = AGX_FORMAT_I32
+            .format = PIPE_FORMAT_R32G32B32A32_FLOAT
         };
      }

@@ -1295,20 +1291,18 @@ agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
 static bool
 agx_update_vs(struct agx_context *ctx)
 {
-   struct agx_vs_shader_key key = { 0 };
-
-   memcpy(key.attributes, ctx->attributes,
-          sizeof(key.attributes[0]) * AGX_MAX_ATTRIBS);
-
-   u_foreach_bit(i, ctx->vb_mask) {
-      key.vbuf_strides[i] = ctx->vertex_buffers[i].stride;
-   }
-
-   struct asahi_shader_key akey = {
-      .base.vs = key
+   struct asahi_shader_key key = {
+      .vbuf.count = util_last_bit(ctx->vb_mask),
    };

-   return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX, &akey);
+   memcpy(key.vbuf.attributes, ctx->attributes,
+          sizeof(key.vbuf.attributes[0]) * AGX_MAX_ATTRIBS);
+
+   u_foreach_bit(i, ctx->vb_mask) {
+      key.vbuf.strides[i] = ctx->vertex_buffers[i].stride;
+   }
+
+   return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX, &key);
 }

 static bool
@@ -34,6 +34,7 @@
 #include "asahi/lib/agx_device.h"
 #include "asahi/lib/pool.h"
 #include "asahi/lib/agx_tilebuffer.h"
+#include "asahi/lib/agx_nir_lower_vbo.h"
 #include "asahi/compiler/agx_compile.h"
 #include "asahi/layout/layout.h"
 #include "compiler/nir/nir_lower_blend.h"

@@ -142,6 +143,8 @@ struct agx_blend {

 struct asahi_shader_key {
    struct agx_shader_key base;
+   struct agx_vbufs vbuf;
+
    struct agx_blend blend;
    unsigned nr_cbufs;
