From 1140bdb783b7a99d3380dcabcaa2c9ac525bc252 Mon Sep 17 00:00:00 2001
From: Asahi Lina <lina@asahilina.net>
Date: Wed, 28 Jun 2023 19:15:28 +0900
Subject: [PATCH] asahi: Arrange VS varyings in the correct order

The GPU ABI requires varyings to be grouped as follows:

- Position
- Smooth shaded fp32
- Flat shaded fp32
- Linear shaded fp32
- Smooth shaded fp16
- Flat shaded fp16
- Linear shaded fp16
- Point size

Use the flat shaded mask info we now have in the vertex shader key to
sort things properly, and pass the counts to the hardware.
FP16 is still TODO.

Signed-off-by: Asahi Lina <lina@asahilina.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23998>
---
 src/asahi/compiler/agx_compile.c      | 38 ++++++++++++++++++++++++---
 src/asahi/compiler/agx_compile.h      | 11 ++++++++
 src/gallium/drivers/asahi/agx_state.c | 15 ++++++++---
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 43b536c81bd..6ab3fed0536 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2018,7 +2018,8 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
 
 /* ABI: position first, then user, then psiz */
 static void
-agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
+agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings,
+                      struct agx_shader_key *key)
 {
    unsigned base = 0;
 
@@ -2033,16 +2034,47 @@ agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
    varyings->slots[VARYING_SLOT_POS] = base;
    base += 4;
 
-   u_foreach_bit64(loc, nir->info.outputs_written) {
+   assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded));
+
+   /* Smooth 32-bit user bindings go next */
+   u_foreach_bit64(loc, nir->info.outputs_written &
+                           ~key->vs.outputs_flat_shaded &
+                           ~key->vs.outputs_linear_shaded) {
       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
          continue;
 
       varyings->slots[loc] = base;
       base += 4;
+      varyings->num_32_smooth += 4;
+   }
+
+   /* Flat 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_flat_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_flat += 4;
+   }
+
+   /* Linear 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_linear_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_linear += 4;
    }
 
    /* TODO: Link FP16 varyings */
    varyings->base_index_fp16 = base;
+   varyings->num_16_smooth = 0;
+   varyings->num_16_flat = 0;
+   varyings->num_16_linear = 0;
 
    if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
       varyings->slots[VARYING_SLOT_PSIZ] = base;
@@ -2471,7 +2503,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
 
    /* Must be last since NIR passes can remap driver_location freely */
    if (nir->info.stage == MESA_SHADER_VERTEX)
-      agx_remap_varyings_vs(nir, &out->varyings.vs);
+      agx_remap_varyings_vs(nir, &out->varyings.vs, key);
 
    if (agx_should_dump(nir, AGX_DBG_SHADERS))
       nir_print_shader(nir, stdout);
diff --git a/src/asahi/compiler/agx_compile.h b/src/asahi/compiler/agx_compile.h
index a38aa0922b3..0ef4b9342b2 100644
--- a/src/asahi/compiler/agx_compile.h
+++ b/src/asahi/compiler/agx_compile.h
@@ -10,6 +10,17 @@
 #include "util/u_dynarray.h"
 
 struct agx_varyings_vs {
+   /* The number of user varyings of each type. The varyings must be allocated
+    * in this order ({smooth, flat, linear} × {32, 16}), which may require
+    * remapping.
+    */
+   unsigned num_32_smooth;
+   unsigned num_32_flat;
+   unsigned num_32_linear;
+   unsigned num_16_smooth;
+   unsigned num_16_flat;
+   unsigned num_16_linear;
+
    /* The first index used for FP16 varyings. Indices less than this are treated
     * as FP32. This may require remapping slots to guarantee.
     */
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 4ae696e1aa9..592ec625eac 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -2212,7 +2212,6 @@ agx_batch_init_state(struct agx_batch *batch)
    struct agx_ppp_update ppp =
       agx_new_ppp_update(&batch->pool, (struct AGX_PPP_HEADER){
                                           .w_clamp = true,
-                                          .varying_counts_16 = true,
                                           .cull_2 = true,
                                           .occlusion_query_2 = true,
                                           .output_unknown = true,
@@ -2221,7 +2220,6 @@ agx_batch_init_state(struct agx_batch *batch)
 
    /* clang-format off */
    agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
-   agx_ppp_push(&ppp, VARYING_COUNTS, cfg);
    agx_ppp_push(&ppp, CULL_2, cfg);
    agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
    agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
@@ -2416,6 +2414,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
       .fragment_back_stencil = IS_DIRTY(ZS),
       .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG),
       .varying_counts_32 = IS_DIRTY(VS_PROG),
+      .varying_counts_16 = IS_DIRTY(VS_PROG),
       .cull = IS_DIRTY(RS),
       .fragment_shader =
          IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
@@ -2506,9 +2505,19 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
       }
    }
 
+   assert(dirty.varying_counts_32 == dirty.varying_counts_16);
+
    if (dirty.varying_counts_32) {
       agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
-         cfg.smooth = agx_num_general_outputs(&ctx->vs->info.varyings.vs);
+         cfg.smooth = vs->info.varyings.vs.num_32_smooth;
+         cfg.flat = vs->info.varyings.vs.num_32_flat;
+         cfg.linear = vs->info.varyings.vs.num_32_linear;
+      }
+
+      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
+         cfg.smooth = vs->info.varyings.vs.num_16_smooth;
+         cfg.flat = vs->info.varyings.vs.num_16_flat;
+         cfg.linear = vs->info.varyings.vs.num_16_linear;
       }
    }