From fccc35c2def2293b7adb313265b62d4aa198ff9e Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 24 Sep 2021 19:04:04 +0200 Subject: [PATCH] ir3: Add preamble optimization pass Now that everything is plumbed through, we can tie it together. Part-of: --- src/freedreno/ir3/ir3_compiler.c | 3 + src/freedreno/ir3/ir3_compiler.h | 4 + src/freedreno/ir3/ir3_nir.c | 20 +- src/freedreno/ir3/ir3_nir.h | 2 + .../ir3/ir3_nir_analyze_ubo_ranges.c | 4 +- src/freedreno/ir3/ir3_nir_opt_preamble.c | 420 ++++++++++++++++++ src/freedreno/ir3/ir3_shader.h | 3 + src/freedreno/ir3/meson.build | 1 + 8 files changed, 455 insertions(+), 2 deletions(-) create mode 100644 src/freedreno/ir3/ir3_nir_opt_preamble.c diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 3c9e0db7d57..0d10ae8f23e 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -45,6 +45,7 @@ static const struct debug_named_value shader_debug_options[] = { {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, + {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"}, #ifdef DEBUG /* DEBUG-only options: */ {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, @@ -245,6 +246,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, /* TODO: implement private memory on earlier gen's */ compiler->has_pvtmem = true; + compiler->has_preamble = true; + compiler->tess_use_shared = dev_info->a6xx.tess_use_shared; compiler->storage_16bit = dev_info->a6xx.storage_16bit; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index ddbd152ea7c..1521d58c8cf 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -182,6 +182,9 @@ struct ir3_compiler { * constbuf. a5xx+ has the shared regfile. */ bool has_shared_regfile; + + /* True if preamble instructions (shps, shpe, etc.) are supported */ + bool has_preamble; }; void ir3_compiler_destroy(struct ir3_compiler *compiler); @@ -224,6 +227,7 @@ enum ir3_shader_debug { IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), IR3_DBG_SPILLALL = BITFIELD_BIT(12), + IR3_DBG_NOPREAMBLE = BITFIELD_BIT(13), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 207a327f94f..96529f35727 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -640,11 +640,28 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) progress |= OPT(s, ir3_nir_lower_64b_undef); progress |= OPT(s, nir_lower_int64); + /* Cleanup code leftover from lowering passes before opt_preamble */ + if (progress) { + progress |= OPT(s, nir_opt_constant_folding); + } + + /* Do the preamble before analysing UBO ranges, because it's usually + * higher-value and because it can result in eliminating some indirect UBO + * accesses where otherwise we'd have to push the whole range. However we + * have to lower the preamble after UBO lowering so that UBO lowering can + * insert instructions in the preamble to push UBOs. 
+ */ + if (so->shader->compiler->has_preamble && + !(ir3_shader_debug & IR3_DBG_NOPREAMBLE)) + progress |= OPT(s, ir3_nir_opt_preamble, so); + if (!so->binning_pass) OPT_V(s, ir3_nir_analyze_ubo_ranges, so); progress |= OPT(s, ir3_nir_lower_ubo_loads, so); + progress |= OPT(s, ir3_nir_lower_preamble, so); + OPT_V(s, nir_lower_amul, ir3_glsl_type_size); /* UBO offset lowering has to come after we've decided what will @@ -826,7 +843,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, debug_assert((const_state->ubo_state.size % 16) == 0); unsigned constoff = v->shader->num_reserved_user_consts + - const_state->ubo_state.size / 16; + const_state->ubo_state.size / 16 + + const_state->preamble_size; unsigned ptrsz = ir3_pointer_size(compiler); if (const_state->num_ubos > 0) { diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 0ad5f09766c..7a780191d32 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -73,6 +73,8 @@ bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v); void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_fixup_load_uniform(nir_shader *nir); +bool ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v); +bool ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v); nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 5ee39737016..dea75e95c9e 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -369,7 +369,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v) * allocation of the driver params' const space, because UBO pointers can * be driver params but this pass usually eliminatings them. */ - struct ir3_const_state worst_case_const_state = {}; + struct ir3_const_state worst_case_const_state = { + .preamble_size = const_state->preamble_size, + }; ir3_setup_const_state(nir, v, &worst_case_const_state); const uint32_t max_upload = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16; diff --git a/src/freedreno/ir3/ir3_nir_opt_preamble.c b/src/freedreno/ir3/ir3_nir_opt_preamble.c new file mode 100644 index 00000000000..7c5c60c78d9 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c @@ -0,0 +1,420 @@ +/* + * Copyright © 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ir3_compiler.h" +#include "ir3_nir.h" + +/* Preamble optimization happens in two parts: first we generate the preamble + * using the generic NIR pass, then we setup the preamble sequence and inline + * the preamble into the main shader if there was a preamble. The first part + * should happen before UBO lowering, because we want to prefer more complex + * expressions over UBO loads, but the second part has to happen after UBO + * lowering because it may add copy instructions to the preamble. + */ + +static void +def_size(nir_ssa_def *def, unsigned *size, unsigned *align) +{ + unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size; + /* Due to the implicit const file promotion we want to expand 16-bit values + * to 32-bit so that the truncation in the main shader can hopefully be + * folded into the use. + */ + *size = DIV_ROUND_UP(bit_size, 32) * def->num_components; + *align = 1; +} + +static bool +all_uses_float(nir_ssa_def *def, bool allow_src2) +{ + nir_foreach_if_use (use, def) { + return false; + } + + nir_foreach_use (use, def) { + nir_instr *use_instr = use->parent_instr; + if (use_instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *use_alu = nir_instr_as_alu(use_instr); + unsigned src_index = ~0; + for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) { + if (&use_alu->src[i].src == use) { + src_index = i; + break; + } + } + + assert(src_index != ~0); + nir_alu_type src_type = + nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]); + + if (src_type != nir_type_float || (src_index == 2 && !allow_src2)) + return false; + } + + return true; +} + +static bool +all_uses_bit(nir_ssa_def *def) +{ + nir_foreach_if_use (use, def) { + return false; + } + + nir_foreach_use (use, def) { + nir_instr *use_instr = use->parent_instr; + if (use_instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *use_alu = nir_instr_as_alu(use_instr); + + /* See ir3_cat2_absneg() */ + switch (use_alu->op) { + case nir_op_iand: + case nir_op_ior: + case nir_op_inot: + case nir_op_ixor: + case nir_op_bitfield_reverse: + case nir_op_ufind_msb: + case nir_op_ifind_msb: + case nir_op_find_lsb: + case nir_op_ishl: + case nir_op_ushr: + case nir_op_ishr: + case nir_op_bit_count: + continue; + default: + return false; + } + } + + return true; +} + +static float +instr_cost(nir_instr *instr, const void *data) +{ + /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops + * take 1 (normalized) cycle. + * + * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP + * + * TODO: assume wave128 on fragment/compute shaders? + */ + + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned components = alu->dest.dest.ssa.num_components; + switch (alu->op) { + /* cat4 */ + case nir_op_frcp: + case nir_op_fsqrt: + case nir_op_frsq: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsin: + case nir_op_fcos: + return 4 * components; + + /* Instructions that become src modifiers. Note for conversions this is + * really an approximation. + * + * This prevents silly things like lifting a negate that would become a + * modifier. 
+ */ + case nir_op_f2f32: + case nir_op_f2f16: + case nir_op_f2fmp: + case nir_op_fneg: + return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components; + + case nir_op_fabs: + return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components; + + case nir_op_inot: + return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components; + + /* Instructions that become vector split/collect */ + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_mov: + return 0; + + /* cat1-cat3 */ + default: + return 1 * components; + } + break; + } + + case nir_instr_type_tex: + /* cat5 */ + return 8; + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: { + /* If the UBO and offset are constant, then UBO lowering should do a + * better job trying to lower this, and opt_preamble shouldn't try to + * duplicate it. However if it has a non-constant offset then we can + * avoid setting up a0.x etc. in the main shader and potentially have + * to push less. + */ + bool const_ubo = nir_src_is_const(intrin->src[0]); + if (!const_ubo) { + nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]); + if (rsrc) + const_ubo = nir_src_is_const(rsrc->src[0]); + } + + if (const_ubo && nir_src_is_const(intrin->src[1])) + return 0; + + /* TODO: get actual numbers for ldc */ + return 8; + } + + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ssbo_ir3: + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_image_load: + case nir_intrinsic_bindless_image_load: + /* cat5/isam */ + return 8; + + /* By default assume it's a sysval or something */ + default: + return 0; + } + } + + default: + return 0; + } +} + +static float +rewrite_cost(nir_ssa_def *def, const void *data) +{ + /* We always have to expand booleans */ + if (def->bit_size == 1) + return def->num_components; + + bool mov_needed = false; + nir_foreach_use (use, def) { + nir_instr *parent_instr = use->parent_instr; + if (parent_instr->type != nir_instr_type_alu) { + mov_needed = true; + break; + } else { + nir_alu_instr *alu = nir_instr_as_alu(parent_instr); + if (alu->op == nir_op_vec2 || + alu->op == nir_op_vec3 || + alu->op == nir_op_vec4 || + alu->op == nir_op_mov) { + mov_needed = true; + break; + } else { + /* Assume for non-moves that the const is folded into the src */ + } + } + } + + return mov_needed ? 
def->num_components : 0; +} + +static bool +avoid_instr(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3; +} + +bool +ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v) +{ + struct ir3_const_state *const_state = ir3_const_state(v); + + unsigned max_size; + if (v->binning_pass) { + max_size = const_state->preamble_size * 4; + } else { + struct ir3_const_state worst_case_const_state = {}; + ir3_setup_const_state(nir, v, &worst_case_const_state); + max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4; + } + + if (max_size == 0) + return false; + + nir_opt_preamble_options options = { + .drawid_uniform = true, + .subgroup_size_uniform = true, + .def_size = def_size, + .preamble_storage_size = max_size, + .instr_cost_cb = instr_cost, + .avoid_instr_cb = avoid_instr, + .rewrite_cost_cb = rewrite_cost, + }; + + unsigned size; + bool progress = nir_opt_preamble(nir, &options, &size); + + if (!v->binning_pass) + const_state->preamble_size = DIV_ROUND_UP(size, 4); + + return progress; +} + +bool +ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v) +{ + nir_function_impl *main = nir_shader_get_entrypoint(nir); + + if (!main->preamble) + return false; + + nir_function_impl *preamble = main->preamble->impl; + + /* First, lower load/store_preamble. */ + const struct ir3_const_state *const_state = ir3_const_state(v); + unsigned preamble_base = v->shader->num_reserved_user_consts * 4 + + const_state->ubo_state.size / 4; + unsigned preamble_size = const_state->preamble_size * 4; + + BITSET_DECLARE(promoted_to_float, preamble_size); + memset(promoted_to_float, 0, sizeof(promoted_to_float)); + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, main); + + nir_foreach_block (block, main) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_preamble) + continue; + + nir_ssa_def *dest = &intrin->dest.ssa; + + unsigned offset = preamble_base + nir_intrinsic_base(intrin); + b->cursor = nir_before_instr(instr); + + nir_ssa_def *new_dest = + nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0), + .base = offset); + + if (dest->bit_size == 1) { + new_dest = nir_i2b1(b, new_dest); + } else if (dest->bit_size != 32) { + assert(dest->bit_size == 16); + if (all_uses_float(dest, true)) { + new_dest = nir_f2f16(b, new_dest); + BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin)); + } else { + new_dest = nir_u2u16(b, new_dest); + } + } + + nir_ssa_def_rewrite_uses(dest, new_dest); + nir_instr_remove(instr); + nir_instr_free(instr); + } + } + + nir_builder_init(b, preamble); + + nir_foreach_block (block, preamble) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_store_preamble) + continue; + + nir_ssa_def *src = intrin->src[0].ssa; + unsigned offset = preamble_base + nir_intrinsic_base(intrin); + + b->cursor = nir_before_instr(instr); + + if (src->bit_size == 1) + src = nir_b2i32(b, src); + if (src->bit_size != 32) { + assert(src->bit_size == 16); + if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) { + src = nir_f2f32(b, 
src); + } else { + src = nir_u2u32(b, src); + } + } + + nir_store_uniform_ir3(b, src, .base = offset); + nir_instr_remove(instr); + nir_instr_free(instr); + } + } + + /* Now, create the preamble sequence and move the preamble into the main + * shader: + * + * if (preamble_start_ir3()) { + * if (subgroupElect()) { + * preamble(); + * preamble_end_ir3(); + * } + * } + * ... + */ + + b->cursor = nir_before_cf_list(&main->body); + + nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1)); + { + nir_if *inner_if = nir_push_if(b, nir_elect(b, 1)); + { + nir_call_instr *call = nir_call_instr_create(nir, main->preamble); + nir_builder_instr_insert(b, &call->instr); + nir_preamble_end_ir3(b); + } + nir_pop_if(b, inner_if); + } + nir_pop_if(b, outer_if); + + nir_inline_functions(nir); + exec_node_remove(&main->preamble->node); + main->preamble = NULL; + + nir_metadata_preserve(main, nir_metadata_none); + return true; +} diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index e764fa02044..f543d8316d5 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -157,6 +157,7 @@ struct ir3_ubo_analysis_state { * that pointer size (ubo, etc) changes depending on generation. * * user consts + * preamble consts * UBO addresses * SSBO sizes * image dimensions @@ -209,6 +210,8 @@ struct ir3_const_state { unsigned immediates_size; uint32_t *immediates; + unsigned preamble_size; + /* State of ubo access lowered to push consts: */ struct ir3_ubo_analysis_state ubo_state; }; diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 54d13f50db0..031f0220da9 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -102,6 +102,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_lower_tex_prefetch.c', 'ir3_nir_lower_wide_load_store.c', 'ir3_nir_move_varying_inputs.c', + 'ir3_nir_opt_preamble.c', 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c',
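
Reviewer note (illustrative sketch, not part of the patch): the combined effect of
ir3_nir_opt_preamble and ir3_nir_lower_preamble can be pictured with a small example.
The shader expression and the const offset below are hypothetical; the intrinsic
spellings follow the ones actually used in the patch (load_preamble/store_preamble
from the generic nir_opt_preamble pass, load_uniform/store_uniform_ir3 after
ir3_nir_lower_preamble), and the control flow mirrors the preamble sequence comment
in ir3_nir_lower_preamble.

    /* Before: a uniform-but-expensive expression recomputed per invocation */
    float x = sin(ubo0.scale) * exp2(ubo0.bias);

    /* After opt_preamble + lower_preamble (pseudocode): the expression is
     * evaluated once by a single elected invocation, pushed into the const
     * file at preamble_base, and the main shader reads it back with a plain
     * const-file load.
     */
    if (preamble_start_ir3()) {        /* cf. the shps/shpe instructions noted in ir3_compiler.h */
       if (elect()) {
          float tmp = sin(ubo0.scale) * exp2(ubo0.bias);
          store_uniform_ir3(tmp, .base = preamble_base + 0);
          preamble_end_ir3();
       }
    }
    ...
    float x = load_uniform(.base = preamble_base + 0);

Here preamble_base = num_reserved_user_consts * 4 + ubo_state.size / 4 (dword units,
as computed in ir3_nir_lower_preamble), and the reserved region is
const_state->preamble_size vec4s long, matching the "+ const_state->preamble_size"
term added to constoff in ir3_setup_const_state.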