From c48570d2b266970e15960e8dd0c5ba1b94d606a0 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 30 Jan 2024 18:53:05 -0800
Subject: [PATCH] brw/nir: Treat some ALU results as convergent

v2: Fix for Xe2.

v3: Fix handling of 64-bit CMP results.

v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.

shader-db:

Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21

total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135

Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18

total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134

total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19

total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100

total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0

total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16

total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39

total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2

total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4

LOST:   1
GAINED: 1

fossil-db:

Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%

Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%

Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)

Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
---
 src/intel/compiler/brw_fs_nir.cpp | 72 ++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 21 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 257dfd12011..5ef888f0040 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -65,7 +65,7 @@ struct nir_to_brw_state {
 };
 
 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
-static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
 
 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
@@ -508,11 +508,10 @@ fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
  * match instr.
  */
 static bool
-optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
-                          const brw_reg &result)
+optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
+                          nir_alu_instr *instr, const brw_reg &result)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    /* No fast path for f16 (yet) or f64. */
    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
@@ -736,20 +735,27 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
 {
    const intel_device_info *devinfo = ntb.devinfo;
 
-   brw_reg result =
-      need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
-
-   result.type = brw_type_for_nir_type(devinfo,
-      (nir_alu_type)(nir_op_infos[instr->op].output_type |
-                     instr->def.bit_size));
-
+   bool all_sources_uniform = true;
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       op[i] = get_nir_src(ntb, instr->src[i].src, -1);
       op[i].type = brw_type_for_nir_type(devinfo,
          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                         nir_src_bit_size(instr->src[i].src)));
+
+      /* is_scalar sources won't be is_uniform because get_nir_src was passed
+       * -1 as the channel.
+       */
+      if (!is_uniform(op[i]) && !op[i].is_scalar)
+         all_sources_uniform = false;
    }
 
+   brw_reg result =
+      need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
    /* Move and vecN instrutions may still be vectored.  Return the raw,
     * vectored source and destination so that fs_visitor::nir_emit_alu can
     * handle it.  Other callers should not have to handle these kinds of
@@ -767,6 +773,9 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       break;
    }
 
+   const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
+   const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
+
    /* At this point, we have dealt with any instruction that operates on
     * more than a single channel.  Therefore, we can just adjust the source
     * and destination registers for that channel and emit the instruction.
@@ -780,12 +789,18 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       assert(util_bitcount(write_mask) == 1);
       channel = ffs(write_mask) - 1;
 
-      result = offset(result, bld, channel);
+      result = offset(result, xbld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
+
+      /* If the dispatch width matches the scalar allocation width, offset()
+       * won't set the stride to zero. Force that here.
+       */
+      if (op[i].is_scalar)
+         op[i] = component(op[i], 0);
    }
 
    return result;
@@ -867,14 +882,13 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
                 bool need_dest)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    fs_inst *inst;
    unsigned execution_mode =
-      bld.shader->nir->info.float_controls_execution_mode;
+      ntb.bld.shader->nir->info.float_controls_execution_mode;
 
    brw_reg op[NIR_MAX_VEC_COMPONENTS];
-   brw_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
+   brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
 
 #ifndef NDEBUG
    /* Everything except raw moves, some type conversions, iabs, and ineg
@@ -909,6 +923,8 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    }
 #endif
 
+   const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
+
    switch (instr->op) {
    case nir_op_mov:
    case nir_op_vec2:
@@ -976,7 +992,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
    case nir_op_i2f32:
    case nir_op_u2f32:
-      if (optimize_extract_to_float(ntb, instr, result))
+      if (optimize_extract_to_float(ntb, bld, instr, result))
          return;
       bld.MOV(result, op[0]);
       break;
@@ -1056,7 +1072,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
       if (extract_instr != NULL) {
          if (extract_instr->op == nir_op_extract_u8 ||
              extract_instr->op == nir_op_extract_i8) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
 
             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
             const brw_reg_type type =
@@ -1065,7 +1081,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
             op[0] = subscript(op[0], type, byte);
          } else if (extract_instr->op == nir_op_extract_u16 ||
                     extract_instr->op == nir_op_extract_i16) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
 
             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
             const brw_reg_type type =
@@ -1302,6 +1318,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
       if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
       } else if(bit_size < 32) {
@@ -1333,6 +1355,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
       bld.CMP(dest, op[0], op[1],
               brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
       if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
       } else if (bit_size < 32) {
@@ -1357,7 +1385,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
          /* The sources of the source logical instruction are now the
           * sources of the instruction that will be generated.
           */
-         prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
+         prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
          resolve_inot_sources(ntb, bld, inot_src_instr, op);
 
          /* Smash all of the sources and destination to be signed.  This
@@ -1938,7 +1966,7 @@ get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
 }
 
 static brw_reg
-get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
+get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
 {
    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
    bool is_scalar = false;
@@ -1963,6 +1991,8 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
 
       /* This cannot be is_scalar if NIR thought it was divergent. */
       assert(!(is_scalar && def.divergent));
+   } else if (def.parent_instr->type == nir_instr_type_alu) {
+      is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
    }
 
    const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;