From c43c90a5faf37f48e245165c5c3df837c5d08db0 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Mon, 27 Nov 2023 14:39:56 -0400
Subject: [PATCH] asahi: rewrite pointsize handling

In the wise words of Mike Blumenkrantz, "I hate gl_PointSize and so can you".

The mesa/st lowering won't mesh well with vertex shader epilogues, and it falls
over in various circumstances. I am too tired to go against the grain, so let's
just pretend to be a normal gallium driver and trust in the rasterizer CSO,
lowering point size internally. This properly handles transform feedback without
any hacks, both GL and GLES behaviours, etc.

Fixes:

   KHR-GL31.transform_feedback.capture_vertex_separate_test
   gl-2.0-large-point-fs

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26614>
---
 src/asahi/compiler/agx_compile.c              |  4 --
 src/compiler/nir/nir_intrinsics.py            |  3 +
 .../drivers/asahi/agx_nir_lower_point_size.c  | 58 +++++++++++++++++
 .../drivers/asahi/agx_nir_lower_sysvals.c     |  2 +
 src/gallium/drivers/asahi/agx_pipe.c          |  8 ---
 src/gallium/drivers/asahi/agx_state.c         | 62 ++++++++++++-------
 src/gallium/drivers/asahi/agx_state.h         |  6 ++
 src/gallium/drivers/asahi/meson.build         |  1 +
 8 files changed, 109 insertions(+), 35 deletions(-)
 create mode 100644 src/gallium/drivers/asahi/agx_nir_lower_point_size.c

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 6c008fabc5b..b3a073cb83d 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2906,10 +2906,6 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx,
 
    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
 
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
-   }
-
    /* Lower large arrays to scratch and small arrays to csel */
    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
               glsl_get_natural_size_align_bytes);
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 48c7a28c527..feab2cd4974 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1842,6 +1842,9 @@ system_value("api_sample_mask_agx", 1, bit_sizes=[16])
 # Loads the sample position array as fixed point packed into a 32-bit word
 system_value("sample_positions_agx", 1, bit_sizes=[32])
 
+# Loads the fixed-function glPointSize() value
+system_value("fixed_point_size_agx", 1, bit_sizes=[32])
+
 # Image loads go through the texture cache, which is not coherent with the PBE
 # or memory access, so fencing is necessary for writes to become visible.
 
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_point_size.c b/src/gallium/drivers/asahi/agx_nir_lower_point_size.c
new file mode 100644
index 00000000000..98216f4c0f8
--- /dev/null
+++ b/src/gallium/drivers/asahi/agx_nir_lower_point_size.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "agx_state.h"
+#include "nir_builder.h"
+
+/*
+ * gl_PointSize lowering. This runs late on a vertex shader (epilogue). By this
+ * time, I/O has been lowered, and transform feedback has been written. Point
+ * size will thus only get consumed by the rasterizer, so we can clamp/replace.
+ * We do this instead of the mesa/st lowerings for better behaviour with lowered
+ * I/O and vertex epilogues.
+ */
+
+static bool
+pass(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   bool *program_point_size = data;
+   b->cursor = nir_before_instr(&intr->instr);
+
+   if ((intr->intrinsic != nir_intrinsic_store_output) ||
+       (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PSIZ))
+      return false;
+
+   if (*program_point_size) {
+      /* We want to use this point size. Clamp it. */
+      nir_src_rewrite(&intr->src[0],
+                      nir_fmax(b, intr->src[0].ssa, nir_imm_float(b, 1.0f)));
+   } else {
+      /* We want to override point size. Remove this store. */
+      nir_instr_remove(&intr->instr);
+   }
+
+   return true;
+}
+
+void
+agx_nir_lower_point_size(nir_shader *nir, bool program_point_size)
+{
+   /* Handle existing point size write */
+   nir_shader_intrinsics_pass(nir, pass,
+                              nir_metadata_block_index | nir_metadata_dominance,
+                              &program_point_size);
+
+   /* Write the fixed-function point size if we have one */
+   if (!program_point_size) {
+      nir_builder b =
+         nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
+
+      nir_store_output(
+         &b, nir_load_fixed_point_size_agx(&b), nir_imm_int(&b, 0),
+         .io_semantics.location = VARYING_SLOT_PSIZ,
+         .io_semantics.num_slots = 1, .write_mask = nir_component_mask(1));
+
+      nir->info.outputs_written |= VARYING_BIT_PSIZ;
+   }
+}
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index 3ca7b895780..b99f8465890 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -158,6 +158,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_root(b, 1, 64, &u->input_assembly);
    case nir_intrinsic_load_geometry_param_buffer_agx:
       return load_sysval_root(b, 1, 64, &u->geometry_params);
+   case nir_intrinsic_load_fixed_point_size_agx:
+      return load_sysval_root(b, 1, 32, &u->fixed_point_size);
    default:
       break;
    }
diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c
index 501d66bc908..409cbc810d9 100644
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@@ -1697,13 +1697,6 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEX_ATTRIB_ELEMENT_ALIGNED_ONLY:
       return 1;
 
-   /* We run nir_lower_point_size so we need the GLSL linker to copy
-    * the original gl_PointSize when captured by transform feedback. We could
-    * also copy it ourselves but it's easier to set the CAP.
-    */
-   case PIPE_CAP_PSIZ_CLAMPED:
-      return 1;
-
    case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
       return 16384;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -1758,7 +1751,6 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FLATSHADE:
    case PIPE_CAP_TWO_SIDED_COLOR:
    case PIPE_CAP_ALPHA_TEST:
-   case PIPE_CAP_POINT_SIZE_FIXED:
    case PIPE_CAP_CLIP_PLANES:
    case PIPE_CAP_NIR_IMAGES_AS_DEREF:
       return 0;
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index c73a007a8dc..6d4268b018e 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -50,6 +50,7 @@
 #include "agx_nir_lower_gs.h"
 #include "agx_tilebuffer.h"
 #include "nir_builder.h"
+#include "nir_builder_opcodes.h"
 #include "nir_intrinsics.h"
 #include "nir_intrinsics_indices.h"
 #include "pool.h"
@@ -1697,6 +1698,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
       struct asahi_vs_shader_key *key = &key_->vs;
 
       NIR_PASS_V(nir, agx_nir_lower_vbo, &key->vbuf);
+      NIR_PASS_V(nir, agx_nir_lower_point_size, key->program_point_size);
 
       if (should_lower_clip_m1_1(dev, key->clip_halfz)) {
          NIR_PASS_V(nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
@@ -2126,6 +2128,19 @@ agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
    return true;
 }
 
+static enum mesa_prim
+rast_prim(enum mesa_prim mode, unsigned fill_mode)
+{
+   if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
+      if (fill_mode == PIPE_POLYGON_MODE_POINT)
+         return MESA_PRIM_POINTS;
+      else if (fill_mode == PIPE_POLYGON_MODE_LINE)
+         return MESA_PRIM_LINES;
+   }
+
+   return mode;
+}
+
 static bool
 agx_update_vs(struct agx_context *ctx)
 {
@@ -2136,12 +2151,22 @@ agx_update_vs(struct agx_context *ctx)
     * outputs_{flat,linear}_shaded: FS_PROG
     */
    if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB |
-                       AGX_DIRTY_FS_PROG | AGX_DIRTY_RS)))
+                       AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)))
       return false;
 
+   enum mesa_prim rasterized_prim =
+      rast_prim(ctx->batch->reduced_prim, ctx->rast->base.fill_front);
+
    struct asahi_vs_shader_key key = {
       .vbuf.count = util_last_bit(ctx->vb_mask),
       .clip_halfz = ctx->rast->base.clip_halfz,
+
+      /* If we are not rasterizing points, set program_point_size to eliminate
+       * the useless point size write.
+       */
+      .program_point_size = ctx->rast->base.point_size_per_vertex ||
+                            rasterized_prim != MESA_PRIM_POINTS,
+
       .outputs_flat_shaded =
          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
       .outputs_linear_shaded =
@@ -3824,19 +3849,6 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
    return false;
 }
 
-static enum mesa_prim
-rast_prim(enum mesa_prim mode, unsigned fill_mode)
-{
-   if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
-      if (fill_mode == PIPE_POLYGON_MODE_POINT)
-         return MESA_PRIM_POINTS;
-      else if (fill_mode == PIPE_POLYGON_MODE_LINE)
-         return MESA_PRIM_LINES;
-   }
-
-   return mode;
-}
-
 static struct agx_uncompiled_shader *
 agx_get_passthrough_gs(struct agx_context *ctx,
                        struct agx_uncompiled_shader *prev_cso,
@@ -4031,6 +4043,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
 
    agx_batch_init_state(batch);
 
+   /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
+    * agx_update_vs/agx_update_fs, which specialize based on primitive.
+    */
+   enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
+   if (reduced_prim != batch->reduced_prim)
+      ctx->dirty |= AGX_DIRTY_PRIM;
+   batch->reduced_prim = reduced_prim;
+
    /* Update shaders first so we can use them after */
    if (agx_update_vs(ctx)) {
       ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
@@ -4060,14 +4080,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       ctx->dirty |= AGX_DIRTY_VS;
    }
 
-   /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
-    * agx_update_fs, which specializes based on primitive.
-    */
-   enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
-   if (reduced_prim != batch->reduced_prim)
-      ctx->dirty |= AGX_DIRTY_PRIM;
-   batch->reduced_prim = reduced_prim;
-
    if (agx_update_fs(batch)) {
       ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
       ctx->stage[PIPE_SHADER_FRAGMENT].dirty = ~0;
@@ -4132,8 +4144,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
              sizeof(ctx->blend_color));
    }
 
+   if (IS_DIRTY(RS)) {
+      batch->uniforms.fixed_point_size = ctx->rast->base.point_size;
+   }
+
    if (IS_DIRTY(VS) || IS_DIRTY(FS) || ctx->gs || IS_DIRTY(VERTEX) ||
-       IS_DIRTY(BLEND_COLOR)) {
+       IS_DIRTY(BLEND_COLOR) || IS_DIRTY(RS)) {
 
       agx_upload_uniforms(batch);
    }
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 25b76e8dd5b..29225ce2f91 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -108,6 +108,9 @@ struct PACKED agx_draw_uniforms {
    /* Blend constant if any */
    float blend_constant[4];
 
+   /* glPointSize value */
+   float fixed_point_size;
+
    /* Value of the multisample control register, containing sample positions in
     * each byte (x in low nibble, y in high nibble).
     */
@@ -364,6 +367,7 @@ struct agx_blend {
 struct asahi_vs_shader_key {
    struct agx_vbufs vbuf;
    bool clip_halfz;
+   bool program_point_size;
    uint64_t outputs_flat_shaded;
    uint64_t outputs_linear_shaded;
 };
@@ -796,6 +800,8 @@ void agx_upload_uniforms(struct agx_batch *batch);
 uint64_t agx_upload_stage_uniforms(struct agx_batch *batch, uint64_t textures,
                                    enum pipe_shader_type stage);
 
+void agx_nir_lower_point_size(nir_shader *nir, bool program_point_size);
+
 bool agx_nir_lower_sysvals(nir_shader *shader, bool lower_draw_params);
 
 bool agx_nir_layout_uniforms(nir_shader *shader,
diff --git a/src/gallium/drivers/asahi/meson.build b/src/gallium/drivers/asahi/meson.build
index 17bfabf2d6d..140024e3f9b 100644
--- a/src/gallium/drivers/asahi/meson.build
+++ b/src/gallium/drivers/asahi/meson.build
@@ -10,6 +10,7 @@ files_asahi = files(
   'agx_pipe.c',
   'agx_nir_lower_sysvals.c',
   'agx_nir_lower_bindings.c',
+  'agx_nir_lower_point_size.c',
   'agx_query.c',
   'agx_state.c',
   'agx_streamout.c',