From 3fcdd9e4a7a63a9a6a1f72c2c459e02acba7f578 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= <pavel.ondracka@gmail.com>
Date: Thu, 24 Nov 2022 09:24:35 +0100
Subject: [PATCH] nir/lower_bool: ntt: Generate a good opcode for bcsel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is heavily copy-pasted from a patch of Ian Romanick, including the
commit message.

Previously, this pass always generated fcsel for bcsel.  This was the
only place that generate fcsel, so various drivers assumed (and needed!)
that src0 was a Boolean with 0.0 or 1.0 as the only values.

Specifically, many DX9 / GL_ARB_vertex_program platforms lack a CMP
instruction in vertex shaders.  In those cases, they would use LRP to
implement fcsel.  The bummer is that many plaforms have a real fcsel
instruction, and those platforms would benefit from other places
generating that opcode.

Instead of leaving assumptions in drivers about the sources of an opcode
that they can't really support, allow them to control the way the
lowering pass translates bcsel.  Two flags are used to control this:

- If the driver sets has_fused_comp_and_csel in nir_options, fcsel_gt
  will be used.  Since the Boolean value is 0.0 or 1.0, this is
  equivalent to fcsel.

- If the parameter has_fcsel_ne is set, fcsel will be used.  This is the
  old path.

- Otherwise, the lowering pass assumes we're on a crufty, old DX9 vertex
  program, and it emits flrp.

With this, the assumptions about src0 of fcsel in NTT can be removed.
If a platform can't handle fcsel, it should ensure that the lowering
pass won't generate it.

No change in shader-db.

Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20162>
---
 src/compiler/nir/nir.h                        |  2 +-
 src/compiler/nir/nir_lower_bool_to_float.c    | 41 ++++++++++++++++---
 src/gallium/auxiliary/nir/nir_to_tgsi.c       | 28 ++++---------
 .../drivers/etnaviv/etnaviv_compiler_nir.c    |  2 +-
 src/gallium/drivers/freedreno/a2xx/ir2_nir.c  |  2 +-
 src/gallium/drivers/lima/lima_program.c       |  4 +-
 6 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 736b20c261b..8ad4b11768c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4972,7 +4972,7 @@ bool nir_scale_fdiv(nir_shader *shader);
 bool nir_lower_alu_to_scalar(nir_shader *shader, nir_instr_filter_cb cb, const void *data);
 bool nir_lower_alu_width(nir_shader *shader, nir_vectorize_cb cb, const void *data);
 bool nir_lower_bool_to_bitsize(nir_shader *shader);
-bool nir_lower_bool_to_float(nir_shader *shader);
+bool nir_lower_bool_to_float(nir_shader *shader, bool has_fcsel_ne);
 bool nir_lower_bool_to_int32(nir_shader *shader);
 bool nir_opt_simplify_convert_alu_types(nir_shader *shader);
 bool nir_lower_const_arrays_to_uniforms(nir_shader *shader,
diff --git a/src/compiler/nir/nir_lower_bool_to_float.c b/src/compiler/nir/nir_lower_bool_to_float.c
index 593a9efe625..8ede52b3287 100644
--- a/src/compiler/nir/nir_lower_bool_to_float.c
+++ b/src/compiler/nir/nir_lower_bool_to_float.c
@@ -43,7 +43,8 @@ rewrite_1bit_ssa_def_to_32bit(nir_ssa_def *def, void *_progress)
 }
 
 static bool
-lower_alu_instr(nir_builder *b, nir_alu_instr *alu)
+lower_alu_instr(nir_builder *b, nir_alu_instr *alu, bool has_fcsel_ne,
+                bool has_fcsel_gt)
 {
    const nir_op_info *op_info = &nir_op_infos[alu->op];
 
@@ -96,7 +97,22 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *alu)
    case nir_op_bany_inequal3: alu->op = nir_op_fany_nequal3; break;
    case nir_op_bany_inequal4: alu->op = nir_op_fany_nequal4; break;
 
-   case nir_op_bcsel: alu->op = nir_op_fcsel; break;
+   case nir_op_bcsel:
+      if (has_fcsel_gt)
+         alu->op = nir_op_fcsel_gt;
+      else if (has_fcsel_ne)
+         alu->op = nir_op_fcsel;
+      else {
+         /* Only a few pre-VS 4.0 platforms (e.g., r300 vertex shaders) should
+          * hit this path.
+          */
+         rep = nir_flrp(b,
+                        nir_ssa_for_alu_src(b, alu, 2),
+                        nir_ssa_for_alu_src(b, alu, 1),
+                        nir_ssa_for_alu_src(b, alu, 0));
+      }
+
+      break;
 
    case nir_op_iand: alu->op = nir_op_fmul; break;
    case nir_op_ixor: alu->op = nir_op_sne; break;
@@ -138,14 +154,22 @@ lower_tex_instr(nir_tex_instr *tex)
    return progress;
 }
 
+struct lower_bool_to_float_data {
+   bool has_fcsel_ne;
+   bool has_fcsel_gt;
+};
+
 static bool
 nir_lower_bool_to_float_instr(nir_builder *b,
                               nir_instr *instr,
-                              UNUSED void *cb_data)
+                              void *cb_data)
 {
+   struct lower_bool_to_float_data *data = cb_data;
+
    switch (instr->type) {
    case nir_instr_type_alu:
-      return lower_alu_instr(b, nir_instr_as_alu(instr));
+      return lower_alu_instr(b, nir_instr_as_alu(instr),
+                             data->has_fcsel_ne, data->has_fcsel_gt);
 
    case nir_instr_type_load_const: {
       nir_load_const_instr *load = nir_instr_as_load_const(instr);
@@ -177,10 +201,15 @@ nir_lower_bool_to_float_instr(nir_builder *b,
 }
 
 bool
-nir_lower_bool_to_float(nir_shader *shader)
+nir_lower_bool_to_float(nir_shader *shader, bool has_fcsel_ne)
 {
+   struct lower_bool_to_float_data data = {
+      .has_fcsel_ne = has_fcsel_ne,
+      .has_fcsel_gt = shader->options->has_fused_comp_and_csel
+   };
+
    return nir_shader_instructions_pass(shader, nir_lower_bool_to_float_instr,
                                        nir_metadata_block_index |
                                        nir_metadata_dominance,
-                                       NULL);
+                                       &data);
 }
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index f3d69c09ed0..08d19fced26 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -1645,26 +1645,13 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
          break;
 
       case nir_op_fcsel:
-         /* NIR fcsel is src0 != 0 ? src1 : src2.
-          * TGSI CMP is src0 < 0 ? src1 : src2.
-          *
-          * However, fcsel so far as I can find only appears on bools-as-floats
-          * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
-          * important to not have an abs here, as i915g has to make extra
-          * instructions to do the abs.
+         /* If CMP isn't supported, then the flags that enable NIR to generate
+          * this opcode should also not be set.
           */
-         if (c->options->lower_cmp) {
-            /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a
-             * LRP on the boolean 1.0/0.0 value, instead of requiring the
-             * backend to turn the src0 into 1.0/0.0 first.
-             *
-             * We don't use this in general because some hardware (i915 FS) the
-             * LRP gets expanded to MUL/MAD.
-             */
-            ntt_LRP(c, dst, src[0], src[1], src[2]);
-         } else {
-            ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
-         }
+         assert(!c->options->lower_cmp);
+
+         /* Implement this as CMP(-abs(src0), src1, src2). */
+         ntt_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
          break;
 
       case nir_op_fcsel_gt:
@@ -3942,7 +3929,8 @@ const void *nir_to_tgsi_options(struct nir_shader *s,
       NIR_PASS_V(s, nir_lower_bool_to_int32);
    } else {
       NIR_PASS_V(s, nir_lower_int_to_float);
-      NIR_PASS_V(s, nir_lower_bool_to_float);
+      NIR_PASS_V(s, nir_lower_bool_to_float,
+                 !options->lower_cmp && !options->lower_fabs);
       /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
       NIR_PASS_V(s, nir_copy_prop);
       NIR_PASS_V(s, nir_opt_dce);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c b/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
index e64eb8b2811..b4ec5071795 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
@@ -1158,7 +1158,7 @@ etna_compile_shader(struct etna_shader_variant *v)
        */
       NIR_PASS_V(s, nir_lower_int_to_float);
       NIR_PASS_V(s, nir_opt_algebraic);
-      NIR_PASS_V(s, nir_lower_bool_to_float);
+      NIR_PASS_V(s, nir_lower_bool_to_float, true);
    } else {
       NIR_PASS_V(s, nir_lower_bool_to_int32);
    }
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
index 0218975625d..9aa23b8b976 100644
--- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -1111,7 +1111,7 @@ ir2_nir_compile(struct ir2_context *ctx, bool binning)
    OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);
 
    OPT_V(ctx->nir, nir_lower_int_to_float);
-   OPT_V(ctx->nir, nir_lower_bool_to_float);
+   OPT_V(ctx->nir, nir_lower_bool_to_float, true);
    while (OPT(ctx->nir, nir_opt_algebraic))
       ;
    OPT_V(ctx->nir, nir_opt_algebraic_late);
diff --git a/src/gallium/drivers/lima/lima_program.c b/src/gallium/drivers/lima/lima_program.c
index 0c6b3c0f550..418808afcba 100644
--- a/src/gallium/drivers/lima/lima_program.c
+++ b/src/gallium/drivers/lima/lima_program.c
@@ -147,7 +147,7 @@ lima_program_optimize_vs_nir(struct nir_shader *s)
    NIR_PASS_V(s, nir_lower_int_to_float);
    /* int_to_float pass generates ftrunc, so lower it */
    NIR_PASS(progress, s, lima_nir_lower_ftrunc);
-   NIR_PASS_V(s, nir_lower_bool_to_float);
+   NIR_PASS_V(s, nir_lower_bool_to_float, true);
 
    NIR_PASS_V(s, nir_copy_prop);
    NIR_PASS_V(s, nir_opt_dce);
@@ -255,7 +255,7 @@ lima_program_optimize_fs_nir(struct nir_shader *s,
    } while (progress);
 
    NIR_PASS_V(s, nir_lower_int_to_float);
-   NIR_PASS_V(s, nir_lower_bool_to_float);
+   NIR_PASS_V(s, nir_lower_bool_to_float, true);
 
    /* Some ops must be lowered after being converted from int ops,
     * so re-run nir_opt_algebraic after int lowering. */