brw/nir: Treat some ALU results as convergent

v2: Fix for Xe2.

v3: Fix handling of 64-bit CMP results.

v4: Scalarize the 16-bit comparison temporary destination when it is
used as a source (as was already done for 64-bit). Suggested by Ken.
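
In outline: an ALU result is treated as convergent when every source is
uniform (or already scalar) and NIR did not mark the def divergent; such
results are allocated and emitted at SIMD1 instead of once per channel.
A condensed sketch of the diff below, not the verbatim patch:

    bool all_sources_uniform = true;
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       /* is_scalar sources won't be is_uniform because get_nir_src was
        * passed -1 as the channel.
        */
       if (!is_uniform(op[i]) && !op[i].is_scalar)
          all_sources_uniform = false;
    }

    /* Emit convergent results with a SIMD1 builder. */
    const fs_builder &bld =
       result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;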

shader-db:

Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21

total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135

Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18

total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134

total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19

total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100

total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0

total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16

total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39

total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2

total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4

LOST:   1
GAINED: 1

fossil-db:

Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%

Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%

Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)

Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
Author: Ian Romanick
Date:   2024-01-30 18:53:05 -08:00
Parent: 7eab2cb67e
Commit: c48570d2b2

@@ -65,7 +65,7 @@ struct nir_to_brw_state {
 };
 
 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
-static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
 
 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
@@ -508,11 +508,10 @@ fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
  * match instr.
  */
 static bool
-optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
-                          const brw_reg &result)
+optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
+                          nir_alu_instr *instr, const brw_reg &result)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    /* No fast path for f16 (yet) or f64. */
    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
@@ -736,20 +735,27 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
 {
    const intel_device_info *devinfo = ntb.devinfo;
 
-   brw_reg result =
-      need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
-
-   result.type = brw_type_for_nir_type(devinfo,
-      (nir_alu_type)(nir_op_infos[instr->op].output_type |
-                     instr->def.bit_size));
+   bool all_sources_uniform = true;
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       op[i] = get_nir_src(ntb, instr->src[i].src, -1);
       op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
+      /* is_scalar sources won't be is_uniform because get_nir_src was passed
+       * -1 as the channel.
+       */
+      if (!is_uniform(op[i]) && !op[i].is_scalar)
+         all_sources_uniform = false;
    }
 
+   brw_reg result =
+      need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
    /* Move and vecN instrutions may still be vectored. Return the raw,
     * vectored source and destination so that fs_visitor::nir_emit_alu can
     * handle it. Other callers should not have to handle these kinds of
@@ -767,6 +773,9 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       break;
    }
 
+   const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
+   const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
+
    /* At this point, we have dealt with any instruction that operates on
    * more than a single channel. Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
@@ -780,12 +789,18 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       assert(util_bitcount(write_mask) == 1);
       channel = ffs(write_mask) - 1;
 
-      result = offset(result, bld, channel);
+      result = offset(result, xbld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
+
+      /* If the dispatch width matches the scalar allocation width, offset()
+       * won't set the stride to zero. Force that here.
+       */
+      if (op[i].is_scalar)
+         op[i] = component(op[i], 0);
    }
 
    return result;
@@ -867,14 +882,13 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
                 bool need_dest)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
    fs_inst *inst;
 
    unsigned execution_mode =
-      bld.shader->nir->info.float_controls_execution_mode;
+      ntb.bld.shader->nir->info.float_controls_execution_mode;
 
    brw_reg op[NIR_MAX_VEC_COMPONENTS];
-   brw_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
+   brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
 
 #ifndef NDEBUG
    /* Everything except raw moves, some type conversions, iabs, and ineg
@@ -909,6 +923,8 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    }
 #endif
 
+   const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
+
    switch (instr->op) {
    case nir_op_mov:
    case nir_op_vec2:
@@ -976,7 +992,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
    case nir_op_i2f32:
    case nir_op_u2f32:
-      if (optimize_extract_to_float(ntb, instr, result))
+      if (optimize_extract_to_float(ntb, bld, instr, result))
         return;
      bld.MOV(result, op[0]);
      break;
@@ -1056,7 +1072,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
      if (extract_instr != NULL) {
         if (extract_instr->op == nir_op_extract_u8 ||
             extract_instr->op == nir_op_extract_i8) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
 
            const brw_reg_type type =
@@ -1065,7 +1081,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
            op[0] = subscript(op[0], type, byte);
         } else if (extract_instr->op == nir_op_extract_u16 ||
                    extract_instr->op == nir_op_extract_i16) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
 
            const brw_reg_type type =
@@ -1302,6 +1318,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
      } else if (bit_size < 32) {
@@ -1333,6 +1355,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
         bld.CMP(dest, op[0], op[1],
                 brw_cmod_for_nir_comparison(instr->op));
 
+         /* The destination will now be used as a source, so select component 0
+          * if it's is_scalar (as is done in get_nir_src).
+          */
+         if (bit_size != 32 && result.is_scalar)
+            dest = component(dest, 0);
+
         if (bit_size > 32) {
            bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
         } else if (bit_size < 32) {
@@ -1357,7 +1385,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
         /* The sources of the source logical instruction are now the
          * sources of the instruction that will be generated.
          */
-         prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
+         prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
         resolve_inot_sources(ntb, bld, inot_src_instr, op);
 
         /* Smash all of the sources and destination to be signed. This
@@ -1938,7 +1966,7 @@ get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
 }
 
 static brw_reg
-get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
+get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
 {
    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
    bool is_scalar = false;
@@ -1963,6 +1991,8 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
 
       /* This cannot be is_scalar if NIR thought it was divergent. */
       assert(!(is_scalar && def.divergent));
+   } else if (def.parent_instr->type == nir_instr_type_alu) {
+      is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
    }
 
    const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;
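
A note on the v3/v4 fixes: for comparisons with non-32-bit results, CMP
writes a 32-bit temporary that is immediately re-read as a source to
produce the final bit size. When that temporary is is_scalar, it must be
addressed as a single component, just as get_nir_src does for scalar
registers. A minimal sketch, mirroring the two CMP hunks above:

    if (bit_size != 32 && result.is_scalar)
       dest = component(dest, 0);   /* component 0 with stride 0 */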